18 changes: 18 additions & 0 deletions include/infinicore/ops/gcd.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Gcd {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor input, Tensor other, Tensor output);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor gcd(Tensor input, Tensor other);
void gcd_(Tensor input, Tensor other, Tensor output);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/glu.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Glu {
public:
using schema = void (*)(Tensor, Tensor, int);
static void execute(Tensor input, Tensor output, int dim);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor glu(Tensor input, int dim);
void glu_(Tensor input, Tensor output, int dim);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/gt.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Gt {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor input, Tensor other, Tensor output);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor gt(Tensor input, Tensor other);
void gt_(Tensor input, Tensor other, Tensor output);

} // namespace infinicore::op
19 changes: 19 additions & 0 deletions include/infinicore/ops/nll_loss.hpp
@@ -0,0 +1,19 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

class NLLLoss {
public:
using schema = void (*)(Tensor, Tensor, std::optional<Tensor>, Tensor, int64_t);
static void execute(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor nll_loss(Tensor input, Tensor target, std::optional<Tensor> weight, int64_t ignore_index);
void nll_loss_(Tensor input, Tensor target, std::optional<Tensor> weight, Tensor output, int64_t ignore_index);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/select_scatter.hpp
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class SelectScatter {
public:
using schema = void (*)(Tensor, Tensor, int64_t, int64_t, Tensor);
static void execute(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor select_scatter(Tensor input, Tensor src, int64_t dim, int64_t index);
void select_scatter_(Tensor input, Tensor src, int64_t dim, int64_t index, Tensor output);

} // namespace infinicore::op
6 changes: 6 additions & 0 deletions python/infinicore/__init__.py
@@ -54,6 +54,9 @@
from infinicore.ops.rearrange import rearrange
from infinicore.ops.squeeze import squeeze
from infinicore.ops.unsqueeze import unsqueeze
from infinicore.ops.gcd import gcd
from infinicore.ops.gt import gt
from infinicore.ops.select_scatter import select_scatter
from infinicore.tensor import (
Tensor,
empty,
@@ -134,6 +137,9 @@
"strided_empty",
"strided_from_blob",
"zeros",
"gcd",
"select_scatter",
"gt",
]

use_ntops = False
4 changes: 4 additions & 0 deletions python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,8 @@
from .rope import RopeAlgo, rope
from .silu import silu
from .swiglu import swiglu
from .nll_loss import nll_loss
from .glu import glu

__all__ = [
"causal_softmax",
@@ -17,4 +19,6 @@
"embedding",
"rope",
"RopeAlgo",
"nll_loss",
"glu",
]
10 changes: 10 additions & 0 deletions python/infinicore/nn/functional/glu.py
@@ -0,0 +1,10 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def glu(input: Tensor, dim: int = -1) -> Tensor:
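    r"""Applies the gated linear unit (GLU): the input is split in half along dim into a and b, and a * sigmoid(b) is returned."""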

if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.glu(input, dim)

return Tensor(_infinicore.glu(input._underlying, dim))
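
A minimal usage sketch (x is an assumed, pre-existing tensor with an even size along the last dimension; GLU halves that dimension):

import infinicore.nn.functional as F

# For x of shape (N, 2H), the result has shape (N, H).
y = F.glu(x, dim=-1)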
39 changes: 39 additions & 0 deletions python/infinicore/nn/functional/nll_loss.py
@@ -0,0 +1,39 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def nll_loss(
input: Tensor,
target: Tensor,
weight: Tensor | None = None,
ignore_index: int = -100,
reduction: str = "mean",
*,
out=None,
) -> Tensor:
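    r"""Computes the negative log likelihood loss.

    Note: on the native (non-ntops) path, reduction is accepted for API
    compatibility but is not forwarded to the underlying kernel; on the
    ntops path, out is ignored.
    """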

if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.nll_loss(
input, target, weight=weight, ignore_index=ignore_index, reduction=reduction
)

weight_underlying = weight._underlying if weight is not None else None

if out is None:
return Tensor(
_infinicore.nll_loss(
input._underlying,
target._underlying,
weight_underlying,
ignore_index
)
)

_infinicore.nll_loss_(
input._underlying,
target._underlying,
weight_underlying,
out._underlying,
ignore_index
)
return out
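
A hedged usage sketch (input and target are assumed tensors holding (N, C) log-probabilities and (N,) class indices, mirroring the PyTorch API this wrapper follows):

import infinicore.nn.functional as F

# Targets equal to ignore_index contribute nothing to the loss.
loss = F.nll_loss(input, target, weight=None, ignore_index=-100)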
15 changes: 15 additions & 0 deletions python/infinicore/ops/gcd.py
@@ -0,0 +1,15 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def gcd(input: Tensor, other: Tensor, *, out=None) -> Tensor:
r"""Computes the element-wise greatest common divisor (GCD)."""

if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.gcd(input, other, out=out)

if out is None:
return Tensor(_infinicore.gcd(input._underlying, other._underlying))

_infinicore.gcd_(input._underlying, other._underlying, out._underlying)
return out
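
A brief usage sketch (a and b are assumed I32 or I64 tensors of identical shape, the only layout the CPU kernel in this PR accepts):

import infinicore

g = infinicore.gcd(a, b)       # out-of-place: allocates the result tensor
infinicore.gcd(a, b, out=g)    # writes into an existing tensor instead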
16 changes: 16 additions & 0 deletions python/infinicore/ops/gt.py
@@ -0,0 +1,16 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def gt(input: Tensor, other: Tensor | float, *, out: Tensor | None = None) -> Tensor:
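    r"""Computes element-wise input > other; a scalar other is materialized as a tensor of input's shape."""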
if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.gt(input, other, out=out)

if isinstance(other, (int, float)):
other = Tensor.full(input.shape, other, dtype=input.dtype, device=input.device)

if out is None:
return Tensor(_infinicore.gt(input._underlying, other._underlying))

_infinicore.gt_(input._underlying, other._underlying, out._underlying)
return out
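
A brief usage sketch (x and y are assumed tensors of the same shape; the scalar form relies on the materialization above):

import infinicore

mask = infinicore.gt(x, y)   # element-wise x > y
mask3 = infinicore.gt(x, 3)  # the scalar 3 is expanded to a tensor of x's shape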
9 changes: 9 additions & 0 deletions python/infinicore/ops/select_scatter.py
@@ -0,0 +1,9 @@
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def select_scatter(input: Tensor, src: Tensor, dim: int, index: int) -> Tensor:
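    r"""Returns a copy of input with src embedded at the given index along dim."""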
if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
return infinicore.ntops.torch.select_scatter(input, src, dim, index)

return Tensor(_infinicore.select_scatter(input._underlying, src._underlying, dim, index))
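
A brief usage sketch (x is an assumed 2-D tensor and row matches one slice of x along dim 0; like torch.select_scatter, a copy is returned and x is left unmodified):

import infinicore

# Copy of x whose slice at index 2 along dim 0 is replaced by row.
y = infinicore.select_scatter(x, row, 0, 2)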
26 changes: 26 additions & 0 deletions src/infinicore/ops/gcd/gcd.cc
@@ -0,0 +1,26 @@
#include "infinicore/ops/gcd.hpp"
#include "../../utils.hpp"

namespace infinicore::op {

common::OpDispatcher<Gcd::schema> &Gcd::dispatcher() {
static common::OpDispatcher<Gcd::schema> dispatcher_;
return dispatcher_;
}

void Gcd::execute(Tensor input, Tensor other, Tensor output) {
infinicore::context::setDevice(input->device());
dispatcher().lookup(input->device().getType())(input, other, output);
}

Tensor gcd(Tensor input, Tensor other) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
gcd_(input, other, output);
return output;
}

void gcd_(Tensor input, Tensor other, Tensor output) {
Gcd::execute(input, other, output);
}

} // namespace infinicore::op
107 changes: 107 additions & 0 deletions src/infinicore/ops/gcd/gcd_cpu.cc
@@ -0,0 +1,107 @@
#include "../../../utils.h"
#include "infinicore/device.hpp"
#include "infinicore/ops/gcd.hpp"
#include <cmath>
#include <numeric>
#include <omp.h>
#include <vector>

namespace infinicore::op::gcd_impl::cpu {

template <typename T>
T compute_gcd(T a, T b) {
return std::gcd(std::abs(a), std::abs(b));
}

template <typename T>
void gcd_contiguous(const T *input_ptr, const T *other_ptr, T *output_ptr, size_t numel) {
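    // Arguments are promoted to int64_t before compute_gcd so that abs/gcd are
    // well-defined for every supported integer width; the result is cast back to T.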
#pragma omp parallel for
for (size_t i = 0; i < numel; ++i) {
auto a = utils::cast<int64_t>(input_ptr[i]);
auto b = utils::cast<int64_t>(other_ptr[i]);
output_ptr[i] = utils::cast<T>(compute_gcd(a, b));
}
}

template <typename T>
void gcd_strided(const T *input_base, const T *other_base, T *output_base,
const std::vector<size_t> &shape,
const std::vector<int64_t> &input_strides,
const std::vector<int64_t> &other_strides,
const std::vector<int64_t> &output_strides) {

size_t numel = 1;
for (auto s : shape) {
numel *= s;
}
int ndim = shape.size();

#pragma omp parallel for
for (size_t i = 0; i < numel; ++i) {
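        // Unravel the flat index i into per-dimension coordinates (row-major)
        // and accumulate each tensor's element offset from its own strides.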
size_t temp_idx = i;
size_t input_offset = 0;
size_t other_offset = 0;
size_t output_offset = 0;

for (int d = ndim - 1; d >= 0; --d) {
size_t coord = temp_idx % shape[d];
temp_idx /= shape[d];

input_offset += coord * input_strides[d];
other_offset += coord * other_strides[d];
output_offset += coord * output_strides[d];
}

auto a = utils::cast<int64_t>(input_base[input_offset]);
auto b = utils::cast<int64_t>(other_base[other_offset]);

output_base[output_offset] = utils::cast<T>(compute_gcd(a, b));
}
}

void calculate(Tensor input, Tensor other, Tensor output) {
if (input->shape() != other->shape() || input->shape() != output->shape()) {
throw std::runtime_error("GCD CPU implementation requires all tensors to have the same shape.");
}

bool all_contiguous = input->is_contiguous() && other->is_contiguous() && output->is_contiguous();
auto dtype = input->dtype();
size_t numel = input->numel();

if (dtype == DataType::I64) {
if (all_contiguous) {
gcd_contiguous<int64_t>(
reinterpret_cast<int64_t *>(input->data()),
reinterpret_cast<int64_t *>(other->data()),
reinterpret_cast<int64_t *>(output->data()), numel);
} else {
gcd_strided<int64_t>(
reinterpret_cast<int64_t *>(input->data()),
reinterpret_cast<int64_t *>(other->data()),
reinterpret_cast<int64_t *>(output->data()),
input->shape(), input->strides(), other->strides(), output->strides());
}
} else if (dtype == DataType::I32) {
if (all_contiguous) {
gcd_contiguous<int32_t>(
reinterpret_cast<int32_t *>(input->data()),
reinterpret_cast<int32_t *>(other->data()),
reinterpret_cast<int32_t *>(output->data()), numel);
} else {
gcd_strided<int32_t>(
reinterpret_cast<int32_t *>(input->data()),
reinterpret_cast<int32_t *>(other->data()),
reinterpret_cast<int32_t *>(output->data()),
input->shape(), input->strides(), other->strides(), output->strides());
}
} else {
throw std::runtime_error("GCD only supports I32 and I64 on CPU.");
}
}

static bool registered = []() {
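    // Self-registration at static-initialization time: hooks the CPU kernel
    // into Gcd's dispatcher when this translation unit is linked in.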
Gcd::dispatcher().registerDevice(Device::Type::CPU, &calculate);
return true;
}();

} // namespace infinicore::op::gcd_impl::cpu