18 changes: 18 additions & 0 deletions include/infinicore/ops/gcd.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Gcd {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor input, Tensor other, Tensor output);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor gcd(Tensor input, Tensor other);
void gcd_(Tensor input, Tensor other, Tensor output);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/glu.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Glu {
public:
using schema = void (*)(Tensor, Tensor, int);
static void execute(Tensor input, Tensor output, int dim);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor glu(Tensor input, int dim);
void glu_(Tensor input, Tensor output, int dim);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/gt.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Gt {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor input, Tensor other, Tensor output);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor gt(Tensor input, Tensor other);
void gt_(Tensor input, Tensor other, Tensor output);

} // namespace infinicore::op
19 changes: 19 additions & 0 deletions include/infinicore/ops/nll_loss.hpp
@@ -0,0 +1,19 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

class NLLLoss {
public:
using schema = void (*)(Tensor, Tensor, std::optional<Tensor>, Tensor, int64_t);
static void execute(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor nll_loss(Tensor input, Tensor target, std::optional<Tensor> weight, int64_t ignore_index);
void nll_loss_(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/select_scatter.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class SelectScatter {
public:
using schema = void (*)(Tensor, Tensor, int64_t, int64_t, Tensor);
static void execute(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor select_scatter(Tensor input, Tensor src, int64_t dim, int64_t index);
void select_scatter_(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output);

} // namespace infinicore::op
6 changes: 6 additions & 0 deletions python/infinicore/__init__.py
@@ -54,6 +54,9 @@
from infinicore.ops.rearrange import rearrange
from infinicore.ops.squeeze import squeeze
from infinicore.ops.unsqueeze import unsqueeze
from infinicore.ops.gcd import gcd
from infinicore.ops.gt import gt
from infinicore.ops.select_scatter import select_scatter
from infinicore.tensor import (
Tensor,
empty,
@@ -134,6 +137,9 @@
"strided_empty",
"strided_from_blob",
"zeros",
"gcd",
"select_scatter",
"gt",
]

use_ntops = False
4 changes: 4 additions & 0 deletions python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,8 @@
from .rope import RopeAlgo, rope
from .silu import silu
from .swiglu import swiglu
from .nll_loss import nll_loss
from .glu import glu

__all__ = [
"causal_softmax",
@@ -17,4 +19,6 @@
"embedding",
"rope",
"RopeAlgo",
"nll_loss",
"glu",
]
10 changes: 10 additions & 0 deletions python/infinicore/nn/functional/glu.py
@@ -0,0 +1,10 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def glu(input: Tensor, dim: int = -1) -> Tensor:
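    r"""Applies the gated linear unit (GLU): the input is split in half along dim into a and b, and a * sigmoid(b) is returned."""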

if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.glu(input, dim)

return Tensor(_infinicore.glu(input._underlying, dim))
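
A minimal usage sketch (x is an assumed, pre-existing tensor with an even size along the last dimension; GLU halves that dimension):

import infinicore.nn.functional as F

# For x of shape (N, 2H), the result has shape (N, H).
y = F.glu(x, dim=-1)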
39 changes: 39 additions & 0 deletions python/infinicore/nn/functional/nll_loss.py
@@ -0,0 +1,39 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def nll_loss(
input: Tensor,
target: Tensor,
weight: Tensor | None = None,
ignore_index: int = -100,
reduction: str = "mean",
*,
out=None,
) -> Tensor:
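    r"""Computes the negative log likelihood loss.

    Note: on the native (non-ntops) path, reduction is accepted for API
    compatibility but is not forwarded to the underlying kernel; on the
    ntops path, out is ignored.
    """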

if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.nll_loss(
input, target, weight=weight, ignore_index=ignore_index, reduction=reduction
)

weight_underlying = weight._underlying if weight is not None else None

if out is None:
return Tensor(
_infinicore.nll_loss(
input._underlying,
target._underlying,
weight_underlying,
ignore_index
)
)

_infinicore.nll_loss_(
input._underlying,
target._underlying,
weight_underlying,
out._underlying,
ignore_index
)
return out
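
A hedged usage sketch (input and target are assumed tensors holding (N, C) log-probabilities and (N,) class indices, mirroring the PyTorch API this wrapper follows):

import infinicore.nn.functional as F

# Targets equal to ignore_index contribute nothing to the loss.
loss = F.nll_loss(input, target, weight=None, ignore_index=-100)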
15 changes: 15 additions & 0 deletions python/infinicore/ops/gcd.py
@@ -0,0 +1,15 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def gcd(input: Tensor, other: Tensor, *, out=None) -> Tensor:
r"""Computes the element-wise greatest common divisor (GCD)."""

if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.gcd(input, other, out=out)

if out is None:
return Tensor(_infinicore.gcd(input._underlying, other._underlying))

_infinicore.gcd_(input._underlying, other._underlying, out._underlying)
return out
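
A brief usage sketch (a and b are assumed I32 or I64 tensors of identical shape, the only layout the CPU kernel in this PR accepts):

import infinicore

g = infinicore.gcd(a, b)       # out-of-place: allocates the result tensor
infinicore.gcd(a, b, out=g)    # writes into an existing tensor instead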
16 changes: 16 additions & 0 deletions python/infinicore/ops/gt.py
@@ -0,0 +1,16 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def gt(input: Tensor, other: Tensor | float, *, out: Tensor | None = None) -> Tensor:
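    r"""Computes element-wise input > other; a scalar other is materialized as a tensor of input's shape."""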
if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.gt(input, other, out=out)

if isinstance(other, (int, float)):
other = Tensor.full(input.shape, other, dtype=input.dtype, device=input.device)

if out is None:
return Tensor(_infinicore.gt(input._underlying, other._underlying))

_infinicore.gt_(input._underlying, other._underlying, out._underlying)
return out
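
A brief usage sketch (x and y are assumed tensors of the same shape; the scalar form relies on the materialization above):

import infinicore

mask = infinicore.gt(x, y)   # element-wise x > y
mask3 = infinicore.gt(x, 3)  # the scalar 3 is expanded to a tensor of x's shape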
9 changes: 9 additions & 0 deletions python/infinicore/ops/select_scatter.py
@@ -0,0 +1,9 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def select_scatter(input: Tensor, src: Tensor, dim: int, index: int) -> Tensor:
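    r"""Returns a copy of input with src embedded at the given index along dim."""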
if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.select_scatter(input, src, dim, index)

return Tensor(_infinicore.select_scatter(input._underlying, src._underlying, dim, index))
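
A brief usage sketch (x is an assumed 2-D tensor and row matches one slice of x along dim 0; like torch.select_scatter, a copy is returned and x is left unmodified):

import infinicore

# Copy of x whose slice at index 2 along dim 0 is replaced by row.
y = infinicore.select_scatter(x, row, 0, 2)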
26 changes: 26 additions & 0 deletions src/infinicore/ops/gcd/gcd.cc
@@ -0,0 +1,26 @@
#include "infinicore/ops/gcd.hpp"
#include "../../utils.hpp"

namespace infinicore::op {

common::OpDispatcher<Gcd::schema> &Gcd::dispatcher() {
static common::OpDispatcher<Gcd::schema> dispatcher_;
return dispatcher_;
}

void Gcd::execute(Tensor input, Tensor other, Tensor output) {
infinicore::context::setDevice(input->device());
dispatcher().lookup(input->device().getType())(input, other, output);
}

Tensor gcd(Tensor input, Tensor other) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
gcd_(input, other, output);
return output;
}

void gcd_(Tensor input, Tensor other, Tensor output) {
Gcd::execute(input, other, output);
}

} // namespace infinicore::op
107 changes: 107 additions & 0 deletions src/infinicore/ops/gcd/gcd_cpu.cc
@@ -0,0 +1,107 @@
#include "../../../utils.h"
#include "infinicore/device.hpp"
#include "infinicore/ops/gcd.hpp"
#include <cmath>
#include <numeric>
#include <omp.h>
#include <vector>

namespace infinicore::op::gcd_impl::cpu {

template <typename T>
T compute_gcd(T a, T b) {
return std::gcd(std::abs(a), std::abs(b));
}

template <typename T>
void gcd_contiguous(const T *input_ptr, const T *other_ptr, T *output_ptr, size_t numel) {
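    // Arguments are promoted to int64_t before compute_gcd so that abs/gcd are
    // well-defined for every supported integer width; the result is cast back to T.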
#pragma omp parallel for
for (size_t i = 0; i < numel; ++i) {
auto a = utils::cast<int64_t>(input_ptr[i]);
auto b = utils::cast<int64_t>(other_ptr[i]);
output_ptr[i] = utils::cast<T>(compute_gcd(a, b));
}
}

template <typename T>
void gcd_strided(const T *input_base, const T *other_base, T *output_base,
const std::vector<size_t> &shape,
const std::vector<int64_t> &input_strides,
const std::vector<int64_t> &other_strides,
const std::vector<int64_t> &output_strides) {

size_t numel = 1;
for (auto s : shape) {
numel *= s;
}
int ndim = shape.size();

#pragma omp parallel for
for (size_t i = 0; i < numel; ++i) {
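        // Unravel the flat index i into per-dimension coordinates (row-major)
        // and accumulate each tensor's element offset from its own strides.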
size_t temp_idx = i;
size_t input_offset = 0;
size_t other_offset = 0;
size_t output_offset = 0;

for (int d = ndim - 1; d >= 0; --d) {
size_t coord = temp_idx % shape[d];
temp_idx /= shape[d];

input_offset += coord * input_strides[d];
other_offset += coord * other_strides[d];
output_offset += coord * output_strides[d];
}

auto a = utils::cast<int64_t>(input_base[input_offset]);
auto b = utils::cast<int64_t>(other_base[other_offset]);

output_base[output_offset] = utils::cast<T>(compute_gcd(a, b));
}
}

void calculate(Tensor input, Tensor other, Tensor output) {
if (input->shape() != other->shape() || input->shape() != output->shape()) {
throw std::runtime_error("GCD CPU implementation requires all tensors to have the same shape.");
}

bool all_contiguous = input->is_contiguous() && other->is_contiguous() && output->is_contiguous();
auto dtype = input->dtype();
size_t numel = input->numel();

if (dtype == DataType::I64) {
if (all_contiguous) {
gcd_contiguous<int64_t>(
reinterpret_cast<int64_t *>(input->data()),
reinterpret_cast<int64_t *>(other->data()),
reinterpret_cast<int64_t *>(output->data()), numel);
} else {
gcd_strided<int64_t>(
reinterpret_cast<int64_t *>(input->data()),
reinterpret_cast<int64_t *>(other->data()),
reinterpret_cast<int64_t *>(output->data()),
input->shape(), input->strides(), other->strides(), output->strides());
}
} else if (dtype == DataType::I32) {
if (all_contiguous) {
gcd_contiguous<int32_t>(
reinterpret_cast<int32_t *>(input->data()),
reinterpret_cast<int32_t *>(other->data()),
reinterpret_cast<int32_t *>(output->data()), numel);
} else {
gcd_strided<int32_t>(
reinterpret_cast<int32_t *>(input->data()),
reinterpret_cast<int32_t *>(other->data()),
reinterpret_cast<int32_t *>(output->data()),
input->shape(), input->strides(), other->strides(), output->strides());
}
} else {
throw std::runtime_error("GCD only supports I32 and I64 on CPU.");
}
}

static bool registered = []() {
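    // Self-registration at static-initialization time: hooks the CPU kernel
    // into Gcd's dispatcher when this translation unit is linked in.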
Gcd::dispatcher().registerDevice(Device::Type::CPU, &calculate);
return true;
}();

} // namespace infinicore::op::gcd_impl::cpu