From 120d746848322092e0084c2c424b1706bae6d489 Mon Sep 17 00:00:00 2001
From: gongchensu
Date: Mon, 5 Jan 2026 15:28:13 +0000
Subject: [PATCH 1/7] Issue/887 - Add pow, div, mod, min, max operators with CPU and NVIDIA implementations.

---
 include/infiniop.h | 3 +
 include/infiniop/ops/div.h | 26 +++
 include/infiniop/ops/max.h | 26 +++
 include/infiniop/ops/min.h | 26 +++
 include/infiniop/ops/mod.h | 26 +++
 include/infiniop/ops/pow.h | 26 +++
 src/infiniop/ops/div/cpu/div_cpu.cc | 50 +++++
 src/infiniop/ops/div/cpu/div_cpu.h | 19 ++
 src/infiniop/ops/div/cuda/kernel.cuh | 23 +++
 src/infiniop/ops/div/nvidia/div_nvidia.cu | 57 ++++++
 src/infiniop/ops/div/nvidia/div_nvidia.cuh | 8 +
 src/infiniop/ops/div/operator.cc | 202 +++++++++++++++++++++
 src/infiniop/ops/max/cpu/max_cpu.cc | 50 +++++
 src/infiniop/ops/max/cpu/max_cpu.h | 20 ++
 src/infiniop/ops/max/cuda/kernel.cuh | 23 +++
 src/infiniop/ops/max/nvidia/max_nvidia.cu | 57 ++++++
 src/infiniop/ops/max/nvidia/max_nvidia.cuh | 8 +
 src/infiniop/ops/max/operator.cc | 202 +++++++++++++++++++++
 src/infiniop/ops/min/cpu/min_cpu.cc | 50 +++++
 src/infiniop/ops/min/cpu/min_cpu.h | 20 ++
 src/infiniop/ops/min/cuda/kernel.cuh | 23 +++
 src/infiniop/ops/min/nvidia/min_nvidia.cu | 57 ++++++
 src/infiniop/ops/min/nvidia/min_nvidia.cuh | 8 +
 src/infiniop/ops/min/operator.cc | 202 +++++++++++++++++++++
 src/infiniop/ops/mod/cpu/mod_cpu.cc | 49 +++++
 src/infiniop/ops/mod/cpu/mod_cpu.h | 23 +++
 src/infiniop/ops/mod/cuda/kernel.cuh | 30 +++
 src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 57 ++++++
 src/infiniop/ops/mod/nvidia/mod_nvidia.cuh | 8 +
 src/infiniop/ops/mod/operator.cc | 142 +++++++++++++++
 src/infiniop/ops/pow/cpu/pow_cpu.cc | 49 +++++
 src/infiniop/ops/pow/cpu/pow_cpu.h | 19 ++
 src/infiniop/ops/pow/cuda/kernel.cuh | 40 ++++
 src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 57 ++++++
 src/infiniop/ops/pow/nvidia/pow_nvidia.cuh | 8 +
 src/infiniop/ops/pow/operator.cc | 142 +++++++++++++++
 test/infiniop/div.py | 192 ++++++++++++++++++++
 test/infiniop/libinfiniop/op_register.py | 170 +++++++++++++++++
 test/infiniop/max.py | 189 +++++++++++++++++++
 test/infiniop/min.py | 189 +++++++++++++++++++
 test/infiniop/mod.py | 190 +++++++++++++++++++
 test/infiniop/pow.py | 190 +++++++++++++++++++
 42 files changed, 2956 insertions(+)
 create mode 100644 include/infiniop/ops/div.h
 create mode 100644 include/infiniop/ops/max.h
 create mode 100644 include/infiniop/ops/min.h
 create mode 100644 include/infiniop/ops/mod.h
 create mode 100644 include/infiniop/ops/pow.h
 create mode 100644 src/infiniop/ops/div/cpu/div_cpu.cc
 create mode 100644 src/infiniop/ops/div/cpu/div_cpu.h
 create mode 100644 src/infiniop/ops/div/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cu
 create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cuh
 create mode 100644 src/infiniop/ops/div/operator.cc
 create mode 100644 src/infiniop/ops/max/cpu/max_cpu.cc
 create mode 100644 src/infiniop/ops/max/cpu/max_cpu.h
 create mode 100644 src/infiniop/ops/max/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cu
 create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cuh
 create mode 100644 src/infiniop/ops/max/operator.cc
 create mode 100644 src/infiniop/ops/min/cpu/min_cpu.cc
 create mode 100644 src/infiniop/ops/min/cpu/min_cpu.h
 create mode 100644 src/infiniop/ops/min/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cu
 create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cuh
 create mode 100644 src/infiniop/ops/min/operator.cc
 create mode 100644
src/infiniop/ops/mod/cpu/mod_cpu.cc create mode 100644 src/infiniop/ops/mod/cpu/mod_cpu.h create mode 100644 src/infiniop/ops/mod/cuda/kernel.cuh create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cu create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cuh create mode 100644 src/infiniop/ops/mod/operator.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.h create mode 100644 src/infiniop/ops/pow/cuda/kernel.cuh create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cu create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cuh create mode 100644 src/infiniop/ops/pow/operator.cc create mode 100644 test/infiniop/div.py create mode 100644 test/infiniop/max.py create mode 100644 test/infiniop/min.py create mode 100644 test/infiniop/mod.py create mode 100644 test/infiniop/pow.py diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..cf1688868 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -9,11 +9,14 @@ #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/div.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/max.h" +#include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..e539b440c --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h new file mode 100644 index 000000000..e6f2f5d4c --- /dev/null +++ b/include/infiniop/ops/max.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MAX_API_H__ +#define __INFINIOP_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h new file mode 100644 index 000000000..f72f0c4db --- /dev/null +++ b/include/infiniop/ops/min.h @@ -0,0 
+1,26 @@ +#ifndef __INFINIOP_MIN_API_H__ +#define __INFINIOP_MIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h new file mode 100644 index 000000000..5a6cd5bbf --- /dev/null +++ b/include/infiniop/ops/mod.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MOD_API_H__ +#define __INFINIOP_MOD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopModDescriptor_t; + +__C __export infiniStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h new file mode 100644 index 000000000..6449d8622 --- /dev/null +++ b/include/infiniop/ops/pow.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_POW_API_H__ +#define __INFINIOP_POW_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; + +__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..19e222031 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,50 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create 
CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<DivOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<DivOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::div::cpu
diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h
new file mode 100644
index 000000000..0373b766f
--- /dev/null
+++ b/src/infiniop/ops/div/cpu/div_cpu.h
@@ -0,0 +1,19 @@
+#ifndef __DIV_CPU_H__
+#define __DIV_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(div, cpu)
+
+namespace op::div::cpu {
+typedef struct DivOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    T operator()(const T &a, const T &b) const {
+        return a / b;
+    }
+} DivOp;
+} // namespace op::div::cpu
+
+#endif // __DIV_CPU_H__
diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh
new file mode 100644
index 000000000..a67993da5
--- /dev/null
+++ b/src/infiniop/ops/div/cuda/kernel.cuh
@@ -0,0 +1,23 @@
+#ifndef __DIV_CUDA_H__
+#define __DIV_CUDA_H__
+
+namespace op::div::cuda {
+typedef struct DivOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __h2div(a, b);
+        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+            return a / b;
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fdividef(a, b);
+        } else {
+            return a / b;
+        }
+    }
+} DivOp;
+} // namespace op::div::cuda
+
+#endif // __DIV_CUDA_H__
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu
new file mode 100644
index 000000000..1abffe816
--- /dev/null
+++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu
@@ -0,0 +1,57 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "div_nvidia.cuh"
+
+namespace op::div::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256,
cuda::DivOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..84021a1af --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/div_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/div_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/div_moore.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void 
*b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc new file mode 100644 index 000000000..1b30fa4e4 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -0,0 +1,50 @@ +#include "max_cpu.h" + +namespace op::max::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::cpu diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h new file mode 100644 index 000000000..4d085ed39 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MAX_CPU_H__ +#define __MAX_CPU_H__ + +#include 
"../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(max, cpu) + +namespace op::max::cpu { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::max(a, b); + } +} MaxOp; +} // namespace op::max::cpu + +#endif // __MAX_CPU_H__ diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh new file mode 100644 index 000000000..bf3977a31 --- /dev/null +++ b/src/infiniop/ops/max/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MAX_CUDA_H__ +#define __MAX_CUDA_H__ + +namespace op::max::cuda { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __hmax2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a > b ? a : b; + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else { + return a > b ? a : b; + } + } +} MaxOp; +} // namespace op::max::cuda + +#endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu new file mode 100644 index 000000000..5e9fb13f4 --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "max_nvidia.cuh" + +namespace op::max::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cuh b/src/infiniop/ops/max/nvidia/max_nvidia.cuh new file mode 100644 index 000000000..b3b60dd2a --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_CUDA_API_H__ +#define __MAX_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(max, nvidia) + +#endif // __MAX_CUDA_API_H__ diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc new file mode 100644 index 000000000..e04368533 --- /dev/null +++ b/src/infiniop/ops/max/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" 
+#include "../../handle.h" +#include "infiniop/ops/max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/max_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/max_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/max_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/max_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaxDescriptor( + infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMax( + infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc new file mode 100644 index 000000000..dc30ee57f --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -0,0 +1,50 @@ +#include "min_cpu.h" + +namespace op::min::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h new file mode 100644 index 000000000..1c84d4fca --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MIN_CPU_H__ +#define __MIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(min, cpu) + +namespace op::min::cpu { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::min(a, b); + } +} MinOp; +} // namespace op::min::cpu + +#endif // __MIN_CPU_H__ diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh new file mode 100644 index 000000000..aac14a0e8 --- /dev/null +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MIN_CUDA_H__ +#define __MIN_CUDA_H__ + +namespace op::min::cuda { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + 
return __hmin2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a < b ? a : b; + } else if constexpr (std::is_same_v) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } +} MinOp; +} // namespace op::min::cuda + +#endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu new file mode 100644 index 000000000..419655e29 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "min_nvidia.cuh" + +namespace op::min::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cuh b/src/infiniop/ops/min/nvidia/min_nvidia.cuh new file mode 100644 index 000000000..ada9a3545 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MIN_CUDA_API_H__ +#define __MIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(min, nvidia) + +#endif // __MIN_CUDA_API_H__ diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc new file mode 100644 index 000000000..8479feab4 --- /dev/null +++ b/src/infiniop/ops/min/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/min.h" + +#ifdef ENABLE_CPU_API +#include "cpu/min_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/min_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/min_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/min_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/min_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/min_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMinDescriptor( + infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::min::NAMESPACE::Descriptor::create( \ + 
handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMin( + infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + 
DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc new file mode 100644 index 000000000..907d05166 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -0,0 +1,49 @@ +#include "mod_cpu.h" + +namespace op::mod::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h new file mode 100644 index 000000000..9e78adca6 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -0,0 +1,23 @@ +#ifndef __MOD_CPU_H__ +#define __MOD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(mod, cpu) + +namespace op::mod::cpu { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cpu + +#endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh new file mode 100644 index 000000000..0dcb54136 --- /dev/null +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __MOD_CUDA_H__ +#define __MOD_CUDA_H__ + +#include +#include + +namespace op::mod::cuda { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(std::fmod(a_, b_)); + } else if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cuda + +#endif // __MOD_CUDA_H__ diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu new file mode 100644 index 000000000..64326d441 --- /dev/null +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu @@ -0,0 +1,57 @@ +#include 
"../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "mod_nvidia.cuh" + +namespace op::mod::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh new file mode 100644 index 000000000..31788cfd2 --- /dev/null +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MOD_CUDA_API_H__ +#define __MOD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(mod, nvidia) + +#endif // __MOD_CUDA_API_H__ diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc new file mode 100644 index 000000000..85810e794 --- /dev/null +++ b/src/infiniop/ops/mod/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/mod.h" + +#ifdef ENABLE_CPU_API +#include "cpu/mod_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/mod_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateModDescriptor( + infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::mod::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + 
GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMod( + infiniopModDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc new file mode 100644 index 000000000..0c6fda0f7 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -0,0 +1,49 @@ +#include "pow_cpu.h" + +namespace op::pow::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h new file mode 100644 index 000000000..21d9bb897 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -0,0 +1,19 @@ +#ifndef __POW_CPU_H__ +#define __POW_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + 
+ELEMENTWISE_DESCRIPTOR(pow, cpu) + +namespace op::pow::cpu { +typedef struct PowOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::pow(a, b); + } +} PowOp; +} // namespace op::pow::cpu + +#endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh new file mode 100644 index 000000000..e8b5324a0 --- /dev/null +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -0,0 +1,40 @@ +#ifndef __POW_CUDA_H__ +#define __POW_CUDA_H__ + +#include +#include +#include + +namespace op::pow::cuda { +typedef struct PowOp { + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float ans_f = __powf(a_, b_); + return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(__powf(a_, b_)); + } else if constexpr (std::is_same_v) { + return __powf(a, b); + } else { + return std::pow(a, b); + } + } +} PowOp; + +} // namespace op::pow::cuda + +#endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu new file mode 100644 index 000000000..3cfd0cd2f --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "pow_nvidia.cuh" + +namespace op::pow::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::nvidia diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh 
b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh new file mode 100644 index 000000000..5bbb2fb8c --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __POW_CUDA_API_H__ +#define __POW_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(pow, nvidia) + +#endif // __POW_CUDA_API_H__ diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc new file mode 100644 index 000000000..e90639f67 --- /dev/null +++ b/src/infiniop/ops/pow/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/pow.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pow_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/pow_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreatePowDescriptor( + infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pow::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopPow( + infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + 
DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..17b22b2e5 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,192 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + # For division, ensure b doesn't contain zeros to avoid division by zero + # Similar to old test: b = torch.rand(...) 
* 2, which gives range [0, 2) + # Use scale=2 to ensure values are in [0, 2) range, then add small bias to avoid zero + b = TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 618be2b05..a61cea018 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -269,6 +269,176 @@ def mul_(lib): ] +@OpRegister.operator +def pow_(lib): + lib.infiniopCreatePowDescriptor.restype = c_int32 + lib.infiniopCreatePowDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPowWorkspaceSize.restype = c_int32 + lib.infiniopGetPowWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPow.restype = c_int32 + lib.infiniopPow.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPowDescriptor.restype = c_int32 + lib.infiniopDestroyPowDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): 
+ lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def mod_(lib): + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetModWorkspaceSize.restype = c_int32 + lib.infiniopGetModWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_(lib): + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def min_(lib): + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMinWorkspaceSize.restype = c_int32 + lib.infiniopGetMinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def random_sample_(lib): lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 diff --git a/test/infiniop/max.py b/test/infiniop/max.py new file mode 100644 index 000000000..e4221cf3e --- /dev/null +++ b/test/infiniop/max.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + 
InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def max(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.maximum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Max on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + 
LIBINFINIOP.infiniopGetMaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_max(): + check_error( + LIBINFINIOP.infiniopMax( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_max() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_max(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/min.py b/test/infiniop/min.py new file mode 100644 index 000000000..19f52a334 --- /dev/null +++ b/test/infiniop/min.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable 
as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def min(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.minimum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Min on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMinDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_min(): + check_error( + LIBINFINIOP.infiniopMin( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_min() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_min(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/mod.py b/test/infiniop/mod.py new file mode 100644 index 000000000..298f3137f --- /dev/null +++ b/test/infiniop/mod.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: mod operation uses fmod for floating point, which should be exact +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def mod_op(c, a, b): + torch.fmod(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + # Generate test tensors with values in a reasonable range for mod operation + # Use scale=10 to get values in [0, 10) range, similar to old test + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0) + # Ensure b doesn't contain zeros to avoid division by zero in mod + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Mod on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateModDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetModWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_mod(): + check_error( + 
LIBINFINIOP.infiniopMod( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_mod() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_mod(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyModDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pow.py b/test/infiniop/pow.py new file mode 100644 index 000000000..f437c4229 --- /dev/null +++ b/test/infiniop/pow.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +# Note: Only F16 and F32 are supported, matching the old repository's binary operator +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: pow operation may have larger numerical errors, especially for F16 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def pow_op(c, a, b): + torch.pow(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + 
# Generate test tensors with values in a reasonable range for pow operation + # Avoid negative bases and very large exponents to prevent numerical issues + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Pow on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePowDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPowWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_pow(): + check_error( + LIBINFINIOP.infiniopPow( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_pow() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + # Use equal_nan=True to handle NaN cases in pow operation + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_pow(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyPowDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 8ab767e0c7aface0a3b2e9f0dd73fce209ee1474 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Wed, 7 Jan 2026 02:34:11 +0000 Subject: [PATCH 2/7] Issue/887 - Add abs,acos,acosh,asin,asinh,atan,atanh,ceil,cos,cosh,erf,floor,log,neg,reciprocal,round,sign,sinh,sqrt,tan operator with CPU and NVIDIA implementations. 
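For reference, here is a minimal usage sketch of one of the new unary operators through the C API added in this series (abs shown; the other operators follow the same flow). The handle, tensor descriptors (y_desc, x_desc), device buffers (y, x), stream, and the CHECK/device_malloc helpers are placeholders for whatever the caller already has, not names taken from the library:

    infiniopAbsDescriptor_t abs_desc;
    CHECK(infiniopCreateAbsDescriptor(handle, &abs_desc, y_desc, x_desc)); /* y = |x|, same shape as x */

    size_t workspace_size = 0;
    CHECK(infiniopGetAbsWorkspaceSize(abs_desc, &workspace_size));
    void *workspace = device_malloc(workspace_size); /* hypothetical allocator for the target device */

    /* The Python tests above pass NULL for the stream argument. */
    CHECK(infiniopAbs(abs_desc, workspace, workspace_size, y, x, stream));

    CHECK(infiniopDestroyAbsDescriptor(abs_desc));

The binary operators added in the previous patch (pow, div, mod, min, max) use the same create / get-workspace / compute / destroy pattern, but take two input descriptors and two input pointers (a, b), as exercised by the tests above.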
--- include/infiniop.h | 20 + include/infiniop/ops/abs.h | 24 + include/infiniop/ops/acos.h | 24 + include/infiniop/ops/acosh.h | 24 + include/infiniop/ops/asin.h | 24 + include/infiniop/ops/asinh.h | 24 + include/infiniop/ops/atan.h | 24 + include/infiniop/ops/atanh.h | 24 + include/infiniop/ops/ceil.h | 24 + include/infiniop/ops/cos.h | 24 + include/infiniop/ops/cosh.h | 24 + include/infiniop/ops/erf.h | 24 + include/infiniop/ops/floor.h | 24 + include/infiniop/ops/log.h | 24 + include/infiniop/ops/neg.h | 24 + include/infiniop/ops/reciprocal.h | 24 + include/infiniop/ops/round.h | 24 + include/infiniop/ops/sign.h | 24 + include/infiniop/ops/sinh.h | 24 + include/infiniop/ops/sqrt.h | 24 + include/infiniop/ops/tan.h | 24 + src/infiniop/ops/abs/cpu/abs_cpu.cc | 48 ++ src/infiniop/ops/abs/cpu/abs_cpu.h | 26 + src/infiniop/ops/abs/cuda/kernel.cuh | 26 + src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 54 ++ src/infiniop/ops/abs/nvidia/abs_nvidia.cuh | 8 + src/infiniop/ops/abs/operator.cc | 139 +++++ src/infiniop/ops/acos/cpu/acos_cpu.cc | 48 ++ src/infiniop/ops/acos/cpu/acos_cpu.h | 22 + src/infiniop/ops/acos/cuda/kernel.cuh | 32 + src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 54 ++ src/infiniop/ops/acos/nvidia/acos_nvidia.cuh | 8 + src/infiniop/ops/acos/operator.cc | 139 +++++ src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 48 ++ src/infiniop/ops/acosh/cpu/acosh_cpu.h | 22 + src/infiniop/ops/acosh/cuda/kernel.cuh | 32 + src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 54 ++ .../ops/acosh/nvidia/acosh_nvidia.cuh | 8 + src/infiniop/ops/acosh/operator.cc | 139 +++++ src/infiniop/ops/asin/cpu/asin_cpu.cc | 48 ++ src/infiniop/ops/asin/cpu/asin_cpu.h | 22 + src/infiniop/ops/asin/cuda/kernel.cuh | 32 + src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 54 ++ src/infiniop/ops/asin/nvidia/asin_nvidia.cuh | 8 + src/infiniop/ops/asin/operator.cc | 139 +++++ src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 48 ++ src/infiniop/ops/asinh/cpu/asinh_cpu.h | 22 + src/infiniop/ops/asinh/cuda/kernel.cuh | 32 + src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 54 ++ .../ops/asinh/nvidia/asinh_nvidia.cuh | 8 + src/infiniop/ops/asinh/operator.cc | 139 +++++ src/infiniop/ops/atan/cpu/atan_cpu.cc | 48 ++ src/infiniop/ops/atan/cpu/atan_cpu.h | 22 + src/infiniop/ops/atan/cuda/kernel.cuh | 32 + src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 54 ++ src/infiniop/ops/atan/nvidia/atan_nvidia.cuh | 8 + src/infiniop/ops/atan/operator.cc | 139 +++++ src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 48 ++ src/infiniop/ops/atanh/cpu/atanh_cpu.h | 22 + src/infiniop/ops/atanh/cuda/kernel.cuh | 32 + src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 54 ++ .../ops/atanh/nvidia/atanh_nvidia.cuh | 8 + src/infiniop/ops/atanh/operator.cc | 139 +++++ src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 48 ++ src/infiniop/ops/ceil/cpu/ceil_cpu.h | 26 + src/infiniop/ops/ceil/cuda/kernel.cuh | 34 + src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 54 ++ src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh | 8 + src/infiniop/ops/ceil/operator.cc | 139 +++++ src/infiniop/ops/cos/cpu/cos_cpu.cc | 48 ++ src/infiniop/ops/cos/cpu/cos_cpu.h | 22 + src/infiniop/ops/cos/cuda/kernel.cuh | 32 + src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 54 ++ src/infiniop/ops/cos/nvidia/cos_nvidia.cuh | 8 + src/infiniop/ops/cos/operator.cc | 139 +++++ src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 48 ++ src/infiniop/ops/cosh/cpu/cosh_cpu.h | 22 + src/infiniop/ops/cosh/cuda/kernel.cuh | 32 + src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 54 ++ src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh | 8 + src/infiniop/ops/cosh/operator.cc | 139 
+++++ src/infiniop/ops/erf/cpu/erf_cpu.cc | 48 ++ src/infiniop/ops/erf/cpu/erf_cpu.h | 22 + src/infiniop/ops/erf/cuda/kernel.cuh | 32 + src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 54 ++ src/infiniop/ops/erf/nvidia/erf_nvidia.cuh | 8 + src/infiniop/ops/erf/operator.cc | 139 +++++ src/infiniop/ops/floor/cpu/floor_cpu.cc | 48 ++ src/infiniop/ops/floor/cpu/floor_cpu.h | 26 + src/infiniop/ops/floor/cuda/kernel.cuh | 34 + src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 54 ++ .../ops/floor/nvidia/floor_nvidia.cuh | 8 + src/infiniop/ops/floor/operator.cc | 139 +++++ src/infiniop/ops/log/cpu/log_cpu.cc | 48 ++ src/infiniop/ops/log/cpu/log_cpu.h | 22 + src/infiniop/ops/log/cuda/kernel.cuh | 32 + src/infiniop/ops/log/nvidia/log_nvidia.cu | 54 ++ src/infiniop/ops/log/nvidia/log_nvidia.cuh | 8 + src/infiniop/ops/log/operator.cc | 139 +++++ src/infiniop/ops/neg/cpu/neg_cpu.cc | 48 ++ src/infiniop/ops/neg/cpu/neg_cpu.h | 20 + src/infiniop/ops/neg/cuda/kernel.cuh | 23 + src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 54 ++ src/infiniop/ops/neg/nvidia/neg_nvidia.cuh | 8 + src/infiniop/ops/neg/operator.cc | 139 +++++ src/infiniop/ops/pow/cuda/kernel.cuh | 2 +- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 48 ++ .../ops/reciprocal/cpu/reciprocal_cpu.h | 20 + src/infiniop/ops/reciprocal/cuda/kernel.cuh | 32 + .../reciprocal/nvidia/reciprocal_nvidia.cu | 54 ++ .../reciprocal/nvidia/reciprocal_nvidia.cuh | 8 + src/infiniop/ops/reciprocal/operator.cc | 139 +++++ src/infiniop/ops/round/cpu/round_cpu.cc | 48 ++ src/infiniop/ops/round/cpu/round_cpu.h | 25 + src/infiniop/ops/round/cuda/kernel.cuh | 34 + src/infiniop/ops/round/nvidia/round_nvidia.cu | 54 ++ .../ops/round/nvidia/round_nvidia.cuh | 8 + src/infiniop/ops/round/operator.cc | 139 +++++ src/infiniop/ops/sign/cpu/sign_cpu.cc | 48 ++ src/infiniop/ops/sign/cpu/sign_cpu.h | 20 + src/infiniop/ops/sign/cuda/kernel.cuh | 25 + src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 54 ++ src/infiniop/ops/sign/nvidia/sign_nvidia.cuh | 8 + src/infiniop/ops/sign/operator.cc | 139 +++++ src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 48 ++ src/infiniop/ops/sinh/cpu/sinh_cpu.h | 22 + src/infiniop/ops/sinh/cuda/kernel.cuh | 32 + src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 54 ++ src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh | 8 + src/infiniop/ops/sinh/operator.cc | 139 +++++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 48 ++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 22 + src/infiniop/ops/sqrt/cuda/kernel.cuh | 32 + src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 54 ++ src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh | 8 + src/infiniop/ops/sqrt/operator.cc | 139 +++++ src/infiniop/ops/tan/cpu/tan_cpu.cc | 48 ++ src/infiniop/ops/tan/cpu/tan_cpu.h | 22 + src/infiniop/ops/tan/cuda/kernel.cuh | 55 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 54 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cuh | 8 + src/infiniop/ops/tan/operator.cc | 139 +++++ test/infiniop/abs.py | 164 +++++ test/infiniop/acos.py | 165 +++++ test/infiniop/acosh.py | 165 +++++ test/infiniop/asin.py | 165 +++++ test/infiniop/asinh.py | 165 +++++ test/infiniop/atan.py | 164 +++++ test/infiniop/atanh.py | 165 +++++ test/infiniop/ceil.py | 165 +++++ test/infiniop/cos.py | 166 +++++ test/infiniop/cosh.py | 165 +++++ test/infiniop/erf.py | 165 +++++ test/infiniop/floor.py | 165 +++++ test/infiniop/libinfiniop/op_register.py | 583 ++++++++++++++++++ test/infiniop/log.py | 166 +++++ test/infiniop/neg.py | 165 +++++ test/infiniop/reciprocal.py | 168 +++++ test/infiniop/round.py | 165 +++++ test/infiniop/sign.py | 166 +++++ test/infiniop/sinh.py | 166 +++++ 
test/infiniop/sqrt.py | 166 +++++ test/infiniop/tan.py | 167 +++++ 163 files changed, 10468 insertions(+), 1 deletion(-) create mode 100644 include/infiniop/ops/abs.h create mode 100644 include/infiniop/ops/acos.h create mode 100644 include/infiniop/ops/acosh.h create mode 100644 include/infiniop/ops/asin.h create mode 100644 include/infiniop/ops/asinh.h create mode 100644 include/infiniop/ops/atan.h create mode 100644 include/infiniop/ops/atanh.h create mode 100644 include/infiniop/ops/ceil.h create mode 100644 include/infiniop/ops/cos.h create mode 100644 include/infiniop/ops/cosh.h create mode 100644 include/infiniop/ops/erf.h create mode 100644 include/infiniop/ops/floor.h create mode 100644 include/infiniop/ops/log.h create mode 100644 include/infiniop/ops/neg.h create mode 100644 include/infiniop/ops/reciprocal.h create mode 100644 include/infiniop/ops/round.h create mode 100644 include/infiniop/ops/sign.h create mode 100644 include/infiniop/ops/sinh.h create mode 100644 include/infiniop/ops/sqrt.h create mode 100644 include/infiniop/ops/tan.h create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.cc create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.h create mode 100644 src/infiniop/ops/abs/cuda/kernel.cuh create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cu create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cuh create mode 100644 src/infiniop/ops/abs/operator.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.h create mode 100644 src/infiniop/ops/acos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cu create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cuh create mode 100644 src/infiniop/ops/acos/operator.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.h create mode 100644 src/infiniop/ops/acosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh create mode 100644 src/infiniop/ops/acosh/operator.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.h create mode 100644 src/infiniop/ops/asin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cu create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cuh create mode 100644 src/infiniop/ops/asin/operator.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.h create mode 100644 src/infiniop/ops/asinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh create mode 100644 src/infiniop/ops/asinh/operator.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.h create mode 100644 src/infiniop/ops/atan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cu create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cuh create mode 100644 src/infiniop/ops/atan/operator.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.h create mode 100644 src/infiniop/ops/atanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh create mode 100644 src/infiniop/ops/atanh/operator.cc create mode 100644 
src/infiniop/ops/ceil/cpu/ceil_cpu.cc create mode 100644 src/infiniop/ops/ceil/cpu/ceil_cpu.h create mode 100644 src/infiniop/ops/ceil/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh create mode 100644 src/infiniop/ops/ceil/operator.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.h create mode 100644 src/infiniop/ops/cos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cu create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cuh create mode 100644 src/infiniop/ops/cos/operator.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.h create mode 100644 src/infiniop/ops/cosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh create mode 100644 src/infiniop/ops/cosh/operator.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.h create mode 100644 src/infiniop/ops/erf/cuda/kernel.cuh create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cu create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cuh create mode 100644 src/infiniop/ops/erf/operator.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.h create mode 100644 src/infiniop/ops/floor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cu create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cuh create mode 100644 src/infiniop/ops/floor/operator.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.h create mode 100644 src/infiniop/ops/log/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cu create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cuh create mode 100644 src/infiniop/ops/log/operator.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.h create mode 100644 src/infiniop/ops/neg/cuda/kernel.cuh create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cu create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cuh create mode 100644 src/infiniop/ops/neg/operator.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h create mode 100644 src/infiniop/ops/reciprocal/cuda/kernel.cuh create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh create mode 100644 src/infiniop/ops/reciprocal/operator.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.h create mode 100644 src/infiniop/ops/round/cuda/kernel.cuh create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cu create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cuh create mode 100644 src/infiniop/ops/round/operator.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.h create mode 100644 src/infiniop/ops/sign/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cu create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cuh create mode 100644 src/infiniop/ops/sign/operator.cc create mode 100644 
src/infiniop/ops/sinh/cpu/sinh_cpu.cc create mode 100644 src/infiniop/ops/sinh/cpu/sinh_cpu.h create mode 100644 src/infiniop/ops/sinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh create mode 100644 src/infiniop/ops/sinh/operator.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.h create mode 100644 src/infiniop/ops/sqrt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh create mode 100644 src/infiniop/ops/sqrt/operator.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.h create mode 100644 src/infiniop/ops/tan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cuh create mode 100644 src/infiniop/ops/tan/operator.cc create mode 100644 test/infiniop/abs.py create mode 100644 test/infiniop/acos.py create mode 100644 test/infiniop/acosh.py create mode 100644 test/infiniop/asin.py create mode 100644 test/infiniop/asinh.py create mode 100644 test/infiniop/atan.py create mode 100644 test/infiniop/atanh.py create mode 100644 test/infiniop/ceil.py create mode 100644 test/infiniop/cos.py create mode 100644 test/infiniop/cosh.py create mode 100644 test/infiniop/erf.py create mode 100644 test/infiniop/floor.py create mode 100644 test/infiniop/log.py create mode 100644 test/infiniop/neg.py create mode 100644 test/infiniop/reciprocal.py create mode 100644 test/infiniop/round.py create mode 100644 test/infiniop/sign.py create mode 100644 test/infiniop/sinh.py create mode 100644 test/infiniop/sqrt.py create mode 100644 test/infiniop/tan.py diff --git a/include/infiniop.h b/include/infiniop.h index cf1688868..4778fce90 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,9 +2,21 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" +#include "infiniop/ops/abs.h" +#include "infiniop/ops/acos.h" +#include "infiniop/ops/acosh.h" #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" +#include "infiniop/ops/asin.h" +#include "infiniop/ops/asinh.h" +#include "infiniop/ops/atan.h" +#include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/ceil.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/cosh.h" +#include "infiniop/ops/erf.h" +#include "infiniop/ops/floor.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" @@ -13,17 +25,24 @@ #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/log.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/max.h" #include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" +#include "infiniop/ops/neg.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/random_sample.h" +#include "infiniop/ops/reciprocal.h" #include "infiniop/ops/rearrange.h" +#include "infiniop/ops/round.h" +#include "infiniop/ops/sign.h" +#include "infiniop/ops/sinh.h" +#include "infiniop/ops/sqrt.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" @@ -33,6 +52,7 @@ #include "infiniop/ops/softplus.h" 
#include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h new file mode 100644 index 000000000..7b5872657 --- /dev/null +++ b/include/infiniop/ops/abs.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ABS_API_H__ +#define __INFINIOP_ABS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h new file mode 100644 index 000000000..fe6af01ed --- /dev/null +++ b/include/infiniop/ops/acos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOS_API_H__ +#define __INFINIOP_ACOS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h new file mode 100644 index 000000000..be28918bb --- /dev/null +++ b/include/infiniop/ops/acosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOSH_API_H__ +#define __INFINIOP_ACOSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h new file mode 100644 index 000000000..2aac6d1e1 --- /dev/null +++ b/include/infiniop/ops/asin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASIN_API_H__ +#define __INFINIOP_ASIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t 
infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h new file mode 100644 index 000000000..d1385fc01 --- /dev/null +++ b/include/infiniop/ops/asinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASINH_API_H__ +#define __INFINIOP_ASINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h new file mode 100644 index 000000000..3b1a5bde3 --- /dev/null +++ b/include/infiniop/ops/atan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATAN_API_H__ +#define __INFINIOP_ATAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanDescriptor(infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h new file mode 100644 index 000000000..800afd5d5 --- /dev/null +++ b/include/infiniop/ops/atanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATANH_API_H__ +#define __INFINIOP_ATANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h new file mode 100644 index 000000000..4539d77fd --- /dev/null +++ b/include/infiniop/ops/ceil.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CEIL_API_H__ +#define __INFINIOP_CEIL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, + 
infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..8f0b6eeb7 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h new file mode 100644 index 000000000..3328151ad --- /dev/null +++ b/include/infiniop/ops/cosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COSH_API_H__ +#define __INFINIOP_COSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h new file mode 100644 index 000000000..8cbb8fb74 --- /dev/null +++ b/include/infiniop/ops/erf.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERF_API_H__ +#define __INFINIOP_ERF_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; + +__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h new file mode 100644 index 000000000..2f65f8f4a --- /dev/null +++ b/include/infiniop/ops/floor.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_FLOOR_API_H__ +#define __INFINIOP_FLOOR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; + +__C 
__export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h new file mode 100644 index 000000000..f5bec4382 --- /dev/null +++ b/include/infiniop/ops/log.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG_API_H__ +#define __INFINIOP_LOG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h new file mode 100644 index 000000000..4d3b06e21 --- /dev/null +++ b/include/infiniop/ops/neg.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_NEG_API_H__ +#define __INFINIOP_NEG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; + +__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h new file mode 100644 index 000000000..73836fea4 --- /dev/null +++ b/include/infiniop/ops/reciprocal.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_RECIPROCAL_API_H__ +#define __INFINIOP_RECIPROCAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h new file mode 100644 index 000000000..18c7fe44e --- /dev/null +++ b/include/infiniop/ops/round.h @@ -0,0 +1,24 @@ 
+#ifndef __INFINIOP_ROUND_API_H__ +#define __INFINIOP_ROUND_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; + +__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h new file mode 100644 index 000000000..fe47c7190 --- /dev/null +++ b/include/infiniop/ops/sign.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIGN_API_H__ +#define __INFINIOP_SIGN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, + infiniopSignDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h new file mode 100644 index 000000000..a5325fb81 --- /dev/null +++ b/include/infiniop/ops/sinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SINH_API_H__ +#define __INFINIOP_SINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h new file mode 100644 index 000000000..db04ec8bc --- /dev/null +++ b/include/infiniop/ops/sqrt.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SQRT_API_H__ +#define __INFINIOP_SQRT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h 
new file mode 100644 index 000000000..69fc47bf1 --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc new file mode 100644 index 000000000..7d6e81d04 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -0,0 +1,48 @@ +#include "abs_cpu.h" + +namespace op::abs::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::cpu diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h new file mode 100644 index 000000000..5b9773298 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -0,0 +1,26 @@ +#ifndef __ABS_CPU_H__ +#define __ABS_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(abs, cpu) + +namespace op::abs::cpu { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cpu + +#endif // __ABS_CPU_H__ diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh new file mode 100644 index 000000000..d7ff2db12 --- /dev/null +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __ABS_CUDA_H__ +#define __ABS_CUDA_H__ + +#include +#include + +namespace op::abs::cuda { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __habs2(x); + } else if constexpr (std::is_same_v) { + return __habs(x); + } else if constexpr (std::is_floating_point_v) { + return 
std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cuda + +#endif // __ABS_CUDA_H__ diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu new file mode 100644 index 000000000..485f0406a --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "abs_nvidia.cuh" + +namespace op::abs::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::nvidia diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh new file mode 100644 index 000000000..db1751e26 --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ABS_NVIDIA_API_H__ +#define __ABS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(abs, nvidia) + +#endif // __ABS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc new file mode 100644 index 000000000..b6820079d --- /dev/null +++ b/src/infiniop/ops/abs/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/abs.h" + +#ifdef ENABLE_CPU_API +#include "cpu/abs_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/abs_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAbsDescriptor( + infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::abs::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAbs( + infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc new file mode 100644 index 000000000..1accb6752 --- /dev/null +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -0,0 +1,48 @@ +#include "acos_cpu.h" + +namespace op::acos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::cpu diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h new file mode 100644 index 000000000..14e74b75c --- /dev/null +++ 
b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOS_CPU_H__ +#define __ACOS_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acos, cpu) + +namespace op::acos::cpu { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::acos(x); + } +} AcosOp; +} // namespace op::acos::cpu + +#endif // __ACOS_CPU_H__ diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh new file mode 100644 index 000000000..c3281c7e3 --- /dev/null +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOS_CUDA_H__ +#define __ACOS_CUDA_H__ + +#include +#include + +namespace op::acos::cuda { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acosf(x0), acosf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acosf(x); + } else { + return std::acos(x); + } + } +} AcosOp; +} // namespace op::acos::cuda + +#endif // __ACOS_CUDA_H__ diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu new file mode 100644 index 000000000..8480219bc --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acos_nvidia.cuh" + +namespace op::acos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::nvidia diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh new file mode 100644 index 000000000..a7ac7e190 --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh @@ -0,0 +1,8 @@ 
+#ifndef __ACOS_NVIDIA_API_H__ +#define __ACOS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acos, nvidia) + +#endif // __ACOS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc new file mode 100644 index 000000000..e775a005a --- /dev/null +++ b/src/infiniop/ops/acos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcosDescriptor( + infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcos( + infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc new 
file mode 100644 index 000000000..005463679 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -0,0 +1,48 @@ +#include "acosh_cpu.h" + +namespace op::acosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::cpu diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h new file mode 100644 index 000000000..b4b710ed5 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOSH_CPU_H__ +#define __ACOSH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acosh, cpu) + +namespace op::acosh::cpu { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::acosh(x); + } +} AcoshOp; +} // namespace op::acosh::cpu + +#endif // __ACOSH_CPU_H__ diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh new file mode 100644 index 000000000..fe444b1b4 --- /dev/null +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOSH_CUDA_H__ +#define __ACOSH_CUDA_H__ + +#include +#include + +namespace op::acosh::cuda { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acoshf(x); + } else { + return std::acosh(x); + } + } +} AcoshOp; +} // namespace op::acosh::cuda + +#endif // __ACOSH_CUDA_H__ diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu new file mode 100644 index 000000000..fc06590a7 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acosh_nvidia.cuh" + +namespace op::acosh::nvidia { + +Descriptor::~Descriptor() = default; 
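
To make the calling convention of these unary operators concrete, here is a host-side sketch of the full descriptor lifecycle, using the cos operator declared in the headers earlier in this patch (infiniopCreateCosDescriptor, infiniopGetCosWorkspaceSize, infiniopCos, infiniopDestroyCosDescriptor). It is only an illustration, not part of the patch: the handle and tensor descriptors are assumed to come from elsewhere (their creation APIs are not touched here), the handle is assumed to target the CPU backend so a plain malloc is acceptable for the workspace, and error handling is reduced to early returns.

#include <cstdlib>
#include "infiniop.h"

// Hypothetical helper: runs y = cos(x) once, given a handle and tensor
// descriptors created elsewhere. Assumes a CPU handle, so the workspace can
// live in host memory; device backends would need device-side allocation.
infiniStatus_t run_cos_once(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t y_desc,
                            infiniopTensorDescriptor_t x_desc,
                            void *y, const void *x, void *stream) {
    infiniopCosDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreateCosDescriptor(handle, &desc, y_desc, x_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetCosWorkspaceSize(desc, &workspace_size);
    if (status != INFINI_STATUS_SUCCESS) {
        infiniopDestroyCosDescriptor(desc);
        return status;
    }

    void *workspace = workspace_size ? std::malloc(workspace_size) : nullptr;
    status = infiniopCos(desc, workspace, workspace_size, y, x, stream);

    std::free(workspace);
    infiniopDestroyCosDescriptor(desc);
    return status;
}

The same create / query-workspace / compute / destroy sequence applies to every operator added in this patch; only the operator name in the function and descriptor types changes.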
+ +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::nvidia diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh new file mode 100644 index 000000000..b13332431 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ACOSH_NVIDIA_API_H__ +#define __ACOSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acosh, nvidia) + +#endif // __ACOSH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc new file mode 100644 index 000000000..9bba3389a --- /dev/null +++ b/src/infiniop/ops/acosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcoshDescriptor( + infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcosh( + infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc new file mode 100644 index 000000000..e149044f1 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -0,0 +1,48 @@ +#include "asin_cpu.h" + +namespace op::asin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::cpu diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h new file mode 100644 index 000000000..22bcba337 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASIN_CPU_H__ +#define __ASIN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asin, cpu) + +namespace op::asin::cpu { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::asin(x); + } +} AsinOp; +} // namespace op::asin::cpu + +#endif // __ASIN_CPU_H__ diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh 
b/src/infiniop/ops/asin/cuda/kernel.cuh new file mode 100644 index 000000000..3e8d11a07 --- /dev/null +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASIN_CUDA_H__ +#define __ASIN_CUDA_H__ + +#include +#include + +namespace op::asin::cuda { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinf(x0), asinf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinf(x); + } else { + return std::asin(x); + } + } +} AsinOp; +} // namespace op::asin::cuda + +#endif // __ASIN_CUDA_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu new file mode 100644 index 000000000..714d2b1b3 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asin_nvidia.cuh" + +namespace op::asin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::nvidia diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh new file mode 100644 index 000000000..46e168ede --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASIN_NVIDIA_API_H__ +#define __ASIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asin, nvidia) + +#endif // __ASIN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc new file mode 100644 index 000000000..c4973e9f5 --- /dev/null +++ b/src/infiniop/ops/asin/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asin.h" + +#ifdef 
ENABLE_CPU_API +#include "cpu/asin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asin_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinDescriptor( + infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsin( + infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc new file mode 100644 index 000000000..e0d5b749a --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -0,0 +1,48 @@ +#include "asinh_cpu.h" + +namespace op::asinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + 
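
The CPU backends above all reduce to a small functor (AbsOp, AcosOp, AcoshOp, AsinOp, and so on) that exposes a num_inputs constant and a templated operator(); the iteration over tensor elements is supplied by the shared elementwise framework in elementwise_cpu.h, which is not part of this patch. The standalone sketch below, built around a hypothetical functor and a plain contiguous buffer, only illustrates the contract those functors satisfy. The real framework presumably also deals with non-contiguous layouts, and dtype dispatch happens in Descriptor::calculate as shown in the patch.

#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical unary functor in the same shape as the ones in this patch.
struct SqrtLikeOp {
    static constexpr size_t num_inputs = 1;
    template <typename T>
    T operator()(const T &x) const {
        return std::sqrt(x);
    }
};

// Minimal stand-in for the elementwise loop: applies a unary functor over a
// contiguous buffer of n elements.
template <typename Op, typename T>
void apply_elementwise(const T *x, T *y, size_t n) {
    static_assert(Op::num_inputs == 1, "sketch only covers unary ops");
    Op op;
    for (size_t i = 0; i < n; ++i) {
        y[i] = op(x[i]);
    }
}

int main() {
    std::vector<float> x = {1.0f, 4.0f, 9.0f};
    std::vector<float> y(x.size());
    apply_elementwise<SqrtLikeOp>(x.data(), y.data(), x.size()); // y = {1, 2, 3}
    return 0;
}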
const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::cpu diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h new file mode 100644 index 000000000..0a999b63b --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASINH_CPU_H__ +#define __ASINH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asinh, cpu) + +namespace op::asinh::cpu { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::asinh(x); + } +} AsinhOp; +} // namespace op::asinh::cpu + +#endif // __ASINH_CPU_H__ diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh new file mode 100644 index 000000000..7cb018c8a --- /dev/null +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASINH_CUDA_H__ +#define __ASINH_CUDA_H__ + +#include +#include + +namespace op::asinh::cuda { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinhf(x); + } else { + return std::asinh(x); + } + } +} AsinhOp; +} // namespace op::asinh::cuda + +#endif // __ASINH_CUDA_H__ diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu new file mode 100644 index 000000000..203008b81 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asinh_nvidia.cuh" + +namespace op::asinh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise 
descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::nvidia diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh new file mode 100644 index 000000000..d1dcb4287 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASINH_NVIDIA_API_H__ +#define __ASINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asinh, nvidia) + +#endif // __ASINH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc new file mode 100644 index 000000000..d9ff5beda --- /dev/null +++ b/src/infiniop/ops/asinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinhDescriptor( + infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsinh( + infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); 
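
One practical note on the inverse trigonometric and hyperbolic operators collected in this patch (acos, asin, acosh, asinh, atan, atanh): several of them have restricted domains, and the functors forward directly to the underlying math routines, so out-of-domain inputs produce NaN rather than an error status. acos and asin expect |x| <= 1, acosh expects x >= 1, and atanh expects |x| < 1; asinh and atan are defined everywhere. The short check below, written against the standard C++ math functions the CPU functors call, is only meant to make that behaviour concrete, for example when choosing test inputs.

#include <cmath>
#include <cstdio>

int main() {
    // In-domain inputs give finite results...
    std::printf("acosh(1.5) = %f\n", std::acosh(1.5f));
    std::printf("atanh(0.5) = %f\n", std::atanh(0.5f));
    // ...while out-of-domain inputs silently become NaN.
    std::printf("acosh(0.5) is NaN: %d\n", std::isnan(std::acosh(0.5f)));
    std::printf("acos(2.0)  is NaN: %d\n", std::isnan(std::acos(2.0f)));
    return 0;
}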
+#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc new file mode 100644 index 000000000..a8c613d1e --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -0,0 +1,48 @@ +#include "atan_cpu.h" + +namespace op::atan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::cpu diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h new file mode 100644 index 000000000..ac2a1bc0c --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATAN_CPU_H__ +#define __ATAN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atan, cpu) + +namespace op::atan::cpu { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::atan(x); + } +} AtanOp; +} // namespace op::atan::cpu + +#endif // __ATAN_CPU_H__ diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh new file mode 100644 index 000000000..0c7745196 --- /dev/null +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATAN_CUDA_H__ +#define __ATAN_CUDA_H__ + +#include +#include + +namespace op::atan::cuda { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanf(__half2float(__low2half(x))), 
atanf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanf(x0), atanf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanf(x); + } else { + return std::atan(x); + } + } +} AtanOp; +} // namespace op::atan::cuda + +#endif // __ATAN_CUDA_H__ diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu new file mode 100644 index 000000000..2c6cf53d4 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atan_nvidia.cuh" + +namespace op::atan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::nvidia diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh new file mode 100644 index 000000000..2aaee1ad9 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATAN_NVIDIA_API_H__ +#define __ATAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atan, nvidia) + +#endif // __ATAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc new file mode 100644 index 000000000..c56e101d2 --- /dev/null +++ b/src/infiniop/ops/atan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanDescriptor( + infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), 
\ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtan( + infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc new file mode 100644 index 000000000..66ef4b1df --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -0,0 +1,48 @@ +#include "atanh_cpu.h" + +namespace op::atanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case 
INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::cpu diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h new file mode 100644 index 000000000..8c2b04755 --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATANH_CPU_H__ +#define __ATANH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atanh, cpu) + +namespace op::atanh::cpu { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::atanh(x); + } +} AtanhOp; +} // namespace op::atanh::cpu + +#endif // __ATANH_CPU_H__ diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh new file mode 100644 index 000000000..5337d8243 --- /dev/null +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATANH_CUDA_H__ +#define __ATANH_CUDA_H__ + +#include +#include + +namespace op::atanh::cuda { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanhf(x); + } else { + return std::atanh(x); + } + } +} AtanhOp; +} // namespace op::atanh::cuda + +#endif // __ATANH_CUDA_H__ diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu new file mode 100644 index 000000000..cb5a1ff03 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atanh_nvidia.cuh" + +namespace op::atanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); 
+ case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::nvidia diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh new file mode 100644 index 000000000..da73cfa99 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATANH_NVIDIA_API_H__ +#define __ATANH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atanh, nvidia) + +#endif // __ATANH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc new file mode 100644 index 000000000..a73adcb23 --- /dev/null +++ b/src/infiniop/ops/atanh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atanh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanhDescriptor( + infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtanh( + infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return 
INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc new file mode 100644 index 000000000..17b3ec888 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -0,0 +1,48 @@ +#include "ceil_cpu.h" + +namespace op::ceil::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::cpu diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h new file mode 100644 index 000000000..c3ca8e441 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -0,0 +1,26 @@ +#ifndef __CEIL_CPU_H__ +#define __CEIL_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(ceil, cpu) + +namespace op::ceil::cpu { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cpu + +#endif // __CEIL_CPU_H__ diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh new file mode 100644 index 000000000..a2d2e7fb5 --- /dev/null +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __CEIL_CUDA_H__ +#define __CEIL_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::ceil::cuda { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2ceil(x); + } else if constexpr (std::is_same_v) { + return hceil(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return ceilf(x); + } else if constexpr (std::is_integral_v) { + return 
x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cuda + +#endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu new file mode 100644 index 000000000..c7ad2ee5b --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "ceil_nvidia.cuh" + +namespace op::ceil::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh new file mode 100644 index 000000000..9bada334d --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CEIL_NVIDIA_API_H__ +#define __CEIL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ceil, nvidia) + +#endif // __CEIL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc new file mode 100644 index 000000000..4e5ee7800 --- /dev/null +++ b/src/infiniop/ops/ceil/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/ceil.h" + +#ifdef ENABLE_CPU_API +#include "cpu/ceil_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ceil_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCeilDescriptor( + infiniopHandle_t handle, + infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::ceil::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCeil( + infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..9dc68d327 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,48 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..9b4236fc2 --- /dev/null +++ 
b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __COS_CPU_H__
+#define __COS_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, cpu)
+
+namespace op::cos::cpu {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::cos(x);
+    }
+} CosOp;
+} // namespace op::cos::cpu
+
+#endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
new file mode 100644
index 000000000..b0dabb340
--- /dev/null
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __COS_CUDA_H__
+#define __COS_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_fp16.h>
+
+namespace op::cos::cuda {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2cos(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hcos(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(cosf(x0), cosf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(cosf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __cosf(x);
+        } else {
+            return std::cos(x);
+        }
+    }
+} CosOp;
+} // namespace op::cos::cuda
+
+#endif // __COS_CUDA_H__
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
new file mode 100644
index 000000000..044c59ca0
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "cos_nvidia.cuh"
+
+namespace op::cos::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cos::nvidia
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
new file mode 100644
index 000000000..a9866e4d2
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __COS_NVIDIA_API_H__
+#define __COS_NVIDIA_API_H__
+
+#include
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..5c464ad60 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc new file mode 100644 index 000000000..9ed8e33da --- /dev/null +++ 
b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -0,0 +1,48 @@ +#include "cosh_cpu.h" + +namespace op::cosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h new file mode 100644 index 000000000..aea359ef2 --- /dev/null +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __COSH_CPU_H__ +#define __COSH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(cosh, cpu) + +namespace op::cosh::cpu { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::cosh(x); + } +} CoshOp; +} // namespace op::cosh::cpu + +#endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh new file mode 100644 index 000000000..ce6806433 --- /dev/null +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __COSH_CUDA_H__ +#define __COSH_CUDA_H__ + +#include +#include + +namespace op::cosh::cuda { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(coshf(x0), coshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(coshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } +} CoshOp; +} // namespace op::cosh::cuda + +#endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu new file mode 100644 index 000000000..a5e1442ce --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cosh_nvidia.cuh" + +namespace op::cosh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh new file mode 100644 index 000000000..6a032b0bb --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COSH_NVIDIA_API_H__ +#define __COSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cosh, nvidia) + +#endif // __COSH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc new file mode 100644 index 000000000..75aac0c91 --- /dev/null +++ b/src/infiniop/ops/cosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCoshDescriptor( + infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopCosh( + infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc new file mode 100644 index 000000000..00b1897d1 --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -0,0 +1,48 @@ +#include "erf_cpu.h" + +namespace op::erf::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h new file mode 100644 index 000000000..c26f519cf --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ERF_CPU_H__ +#define __ERF_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(erf, cpu) + +namespace op::erf::cpu { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::erf(x); + } +} ErfOp; +} // namespace op::erf::cpu + +#endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh new file mode 100644 index 000000000..820c10b19 --- /dev/null +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ERF_CUDA_H__ +#define 
__ERF_CUDA_H__ + +#include +#include + +namespace op::erf::cuda { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(erff(x0), erff(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(erff(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } +} ErfOp; +} // namespace op::erf::cuda + +#endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu new file mode 100644 index 000000000..9080593de --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erf_nvidia.cuh" + +namespace op::erf::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh new file mode 100644 index 000000000..0621150fa --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERF_NVIDIA_API_H__ +#define __ERF_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erf, nvidia) + +#endif // __ERF_NVIDIA_API_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc new file mode 100644 index 000000000..1491cfa9a --- /dev/null +++ b/src/infiniop/ops/erf/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erf.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/erf_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateErfDescriptor( + 
infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopErf( + infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc b/src/infiniop/ops/floor/cpu/floor_cpu.cc new file mode 100644 index 000000000..e809a02e2 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -0,0 +1,48 @@ +#include "floor_cpu.h" + +namespace op::floor::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + 
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h new file mode 100644 index 000000000..91508a384 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -0,0 +1,26 @@ +#ifndef __FLOOR_CPU_H__ +#define __FLOOR_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(floor, cpu) + +namespace op::floor::cpu { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cpu + +#endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh new file mode 100644 index 000000000..c89ce34f4 --- /dev/null +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __FLOOR_CUDA_H__ +#define __FLOOR_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::floor::cuda { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cuda + +#endif // __FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu new file mode 100644 index 000000000..08305048a --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "floor_nvidia.cuh" + +namespace op::floor::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh new file mode 100644 index 000000000..7a3c2f5c7 --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FLOOR_NVIDIA_API_H__ +#define __FLOOR_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(floor, nvidia) + +#endif // __FLOOR_NVIDIA_API_H__ diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc new file mode 100644 index 000000000..4e4ed2b5a --- /dev/null +++ b/src/infiniop/ops/floor/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/floor.h" + +#ifdef ENABLE_CPU_API +#include "cpu/floor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/floor_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateFloorDescriptor( + infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::floor::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFloor( + infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc new file mode 100644 index 000000000..e7314c319 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -0,0 +1,48 @@ +#include "log_cpu.h" + +namespace op::log::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h new file mode 100644 index 000000000..535e681d3 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -0,0 +1,22 @@ +#ifndef __LOG_CPU_H__ +#define __LOG_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(log, cpu) + +namespace op::log::cpu { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::log(x); + } +} LogOp; +} // namespace op::log::cpu + +#endif // __LOG_CPU_H__ diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh new file mode 100644 index 000000000..b1e46873c --- /dev/null +++ b/src/infiniop/ops/log/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __LOG_CUDA_H__ +#define __LOG_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::log::cuda { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = 
__bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } +} LogOp; +} // namespace op::log::cuda + +#endif // __LOG_CUDA_H__ diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu new file mode 100644 index 000000000..9e7bcafc4 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log_nvidia.cuh" + +namespace op::log::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::nvidia diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cuh b/src/infiniop/ops/log/nvidia/log_nvidia.cuh new file mode 100644 index 000000000..c48841622 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG_NVIDIA_API_H__ +#define __LOG_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log, nvidia) + +#endif // __LOG_NVIDIA_API_H__ diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc new file mode 100644 index 000000000..8f2add408 --- /dev/null +++ b/src/infiniop/ops/log/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogDescriptor( + infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::log::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog( + infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc new file mode 100644 index 000000000..5da2ae4c3 --- /dev/null +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -0,0 +1,48 @@ +#include "neg_cpu.h" + +namespace op::neg::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace 
op::neg::cpu
diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h
new file mode 100644
index 000000000..ea45989b3
--- /dev/null
+++ b/src/infiniop/ops/neg/cpu/neg_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __NEG_CPU_H__
+#define __NEG_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(neg, cpu)
+
+namespace op::neg::cpu {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return -x;
+    }
+} NegOp;
+} // namespace op::neg::cpu
+
+#endif // __NEG_CPU_H__
diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh b/src/infiniop/ops/neg/cuda/kernel.cuh
new file mode 100644
index 000000000..57904b3df
--- /dev/null
+++ b/src/infiniop/ops/neg/cuda/kernel.cuh
@@ -0,0 +1,23 @@
+#ifndef __NEG_CUDA_H__
+#define __NEG_CUDA_H__
+
+#include <cuda_fp16.h>
+
+namespace op::neg::cuda {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __hneg2(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hneg(x);
+        } else {
+            return -x;
+        }
+    }
+} NegOp;
+} // namespace op::neg::cuda
+
+#endif // __NEG_CUDA_H__
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
new file mode 100644
index 000000000..d18b8bf25
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "neg_nvidia.cuh"
+
+namespace op::neg::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::neg::nvidia
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
new file mode 100644
index 000000000..1265cd3df
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __NEG_NVIDIA_API_H__
+#define __NEG_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(neg, nvidia)
+
+#endif // __NEG_NVIDIA_API_H__
diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc
new file mode 100644
index 000000000..d4134df3e
--- /dev/null
+++ b/src/infiniop/ops/neg/operator.cc
@@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/neg.h" + +#ifdef ENABLE_CPU_API +#include "cpu/neg_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/neg_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateNegDescriptor( + infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::neg::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopNeg( + infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index e8b5324a0..3786e7a52 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -2,8 +2,8 @@ #define __POW_CUDA_H__ #include -#include #include +#include namespace op::pow::cuda { typedef struct PowOp { diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc new file mode 100644 index 000000000..52874c8b3 --- 
/dev/null +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc @@ -0,0 +1,48 @@ +#include "reciprocal_cpu.h" + +namespace op::reciprocal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reciprocal::cpu diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h new file mode 100644 index 000000000..0a0f223f0 --- /dev/null +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h @@ -0,0 +1,20 @@ +#ifndef __RECIPROCAL_CPU_H__ +#define __RECIPROCAL_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(reciprocal, cpu) + +namespace op::reciprocal::cpu { +typedef struct ReciprocalOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return T(1) / x; + } +} ReciprocalOp; +} // namespace op::reciprocal::cpu + +#endif // __RECIPROCAL_CPU_H__ diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh new file mode 100644 index 000000000..94c71de90 --- /dev/null +++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __RECIPROCAL_CUDA_H__ +#define __RECIPROCAL_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::reciprocal::cuda { +typedef struct ReciprocalOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2rcp(x); + } else if constexpr (std::is_same_v) { + return hrcp(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frcp_rn(x); + } else { + return T(1) / x; + } + } +} ReciprocalOp; +} // namespace op::reciprocal::cuda + +#endif // __RECIPROCAL_CUDA_H__ diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu new file mode 100644 index 000000000..45b74e25e --- /dev/null +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "reciprocal_nvidia.cuh" + 
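+// reciprocal_nvidia.cu: create() accepts only F16/F32 outputs whose shape
+// matches the input and builds the shared elementwise CUDA descriptor;
+// calculate() checks the caller-provided workspace, then launches the
+// templated elementwise kernel (256-thread blocks) with cuda::ReciprocalOp,
+// which computes y = 1 / x per element via hrcp/h2rcp on half data and
+// __frcp_rn on float.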
+namespace op::reciprocal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh new file mode 100644 index 000000000..d98c8f4c2 --- /dev/null +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RECIPROCAL_NVIDIA_API_H__ +#define __RECIPROCAL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(reciprocal, nvidia) + +#endif // __RECIPROCAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc new file mode 100644 index 000000000..033286024 --- /dev/null +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reciprocal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reciprocal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/reciprocal_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateReciprocalDescriptor( + infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reciprocal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + 
GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReciprocal( + infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc new file mode 100644 index 000000000..0b0cea7b7 --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.cc @@ -0,0 +1,48 @@ +#include "round_cpu.h" + +namespace op::round::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::round::cpu diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h new file mode 100644 index 000000000..eccd6df0f --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.h @@ -0,0 +1,25 @@ +#ifndef __ROUND_CPU_H__ +#define __ROUND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(round, cpu) + +namespace op::round::cpu { +typedef struct RoundOp { +public: + static constexpr size_t 
num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } +} RoundOp; +} // namespace op::round::cpu + +#endif // __ROUND_CPU_H__ diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh new file mode 100644 index 000000000..c52a10716 --- /dev/null +++ b/src/infiniop/ops/round/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __ROUND_CUDA_H__ +#define __ROUND_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::round::cuda { +typedef struct RoundOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2rint(x); + } else if constexpr (std::is_same_v) { + return hrint(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(rintf(x0), rintf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(rintf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return rintf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } +} RoundOp; +} // namespace op::round::cuda + +#endif // __ROUND_CUDA_H__ diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu new file mode 100644 index 000000000..c1fabc885 --- /dev/null +++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "round_nvidia.cuh" + +namespace op::round::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::round::nvidia diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cuh b/src/infiniop/ops/round/nvidia/round_nvidia.cuh new file mode 100644 index 000000000..65bb38566 --- /dev/null +++ b/src/infiniop/ops/round/nvidia/round_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ROUND_NVIDIA_API_H__ +#define __ROUND_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(round, nvidia) + +#endif // 
__ROUND_NVIDIA_API_H__ diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc new file mode 100644 index 000000000..9468803c8 --- /dev/null +++ b/src/infiniop/ops/round/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/round.h" + +#ifdef ENABLE_CPU_API +#include "cpu/round_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/round_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateRoundDescriptor( + infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::round::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopRound( + infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc new file mode 100644 index 000000000..1f3430e73 --- /dev/null +++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc @@ -0,0 +1,48 @@ +#include "sign_cpu.h" + +namespace op::sign::cpu { + 
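+// sign_cpu.cc: the CPU descriptor mirrors the other unary elementwise ops.
+// create() validates that the output dtype is F16 or F32 and that the input
+// and output shapes match, then builds the CPU elementwise descriptor;
+// calculate() switches on the stored dtype and runs the elementwise loop
+// with SignOp, which returns 1 for x > 0, 0 for x == 0, and -1 otherwise.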
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sign::cpu diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h new file mode 100644 index 000000000..505194c85 --- /dev/null +++ b/src/infiniop/ops/sign/cpu/sign_cpu.h @@ -0,0 +1,20 @@ +#ifndef __SIGN_CPU_H__ +#define __SIGN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sign, cpu) + +namespace op::sign::cpu { +typedef struct SignOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } +} SignOp; +} // namespace op::sign::cpu + +#endif // __SIGN_CPU_H__ diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh new file mode 100644 index 000000000..3737282b0 --- /dev/null +++ b/src/infiniop/ops/sign/cuda/kernel.cuh @@ -0,0 +1,25 @@ +#ifndef __SIGN_CUDA_H__ +#define __SIGN_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::sign::cuda { +typedef struct SignOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); + return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); + } else if constexpr (std::is_same_v) { + return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); + } else { + return x > T(0) ? T(1) : (x == T(0) ? 
T(0) : T(-1)); + } + } +} SignOp; +} // namespace op::sign::cuda + +#endif // __SIGN_CUDA_H__ diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu new file mode 100644 index 000000000..6a3152e41 --- /dev/null +++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sign_nvidia.cuh" + +namespace op::sign::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sign::nvidia diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh new file mode 100644 index 000000000..d5f2540a3 --- /dev/null +++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGN_NVIDIA_API_H__ +#define __SIGN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sign, nvidia) + +#endif // __SIGN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc new file mode 100644 index 000000000..8f658a9b3 --- /dev/null +++ b/src/infiniop/ops/sign/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sign.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sign_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sign_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSignDescriptor( + infiniopHandle_t handle, + infiniopSignDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sign::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSign( + infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc new file mode 100644 index 000000000..40685847d --- /dev/null +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc @@ -0,0 +1,48 @@ +#include "sinh_cpu.h" + +namespace op::sinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h new file mode 100644 index 000000000..dbc8f3c7e --- /dev/null +++ 
b/src/infiniop/ops/sinh/cpu/sinh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __SINH_CPU_H__ +#define __SINH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sinh, cpu) + +namespace op::sinh::cpu { +typedef struct SinhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::sinh(x); + } +} SinhOp; +} // namespace op::sinh::cpu + +#endif // __SINH_CPU_H__ diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh new file mode 100644 index 000000000..c09150666 --- /dev/null +++ b/src/infiniop/ops/sinh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __SINH_CUDA_H__ +#define __SINH_CUDA_H__ + +#include +#include + +namespace op::sinh::cuda { +typedef struct SinhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(sinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sinhf(x0), sinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return sinhf(x); + } else { + return std::sinh(x); + } + } +} SinhOp; +} // namespace op::sinh::cuda + +#endif // __SINH_CUDA_H__ diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu new file mode 100644 index 000000000..d4c3fd165 --- /dev/null +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sinh_nvidia.cuh" + +namespace op::sinh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sinh::nvidia diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh new file mode 100644 index 000000000..66e3e3e67 --- /dev/null +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh @@ -0,0 +1,8 @@ 
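+// sinh_nvidia.cuh: like the other unary ops in this patch, the header only
+// invokes ELEMENTWISE_DESCRIPTOR(sinh, nvidia), which presumably expands to
+// the op::sinh::nvidia::Descriptor declaration consumed by operator.cc; the
+// per-element math (sinhf on unpacked half/bf16 lanes, std::sinh otherwise)
+// lives in ../cuda/kernel.cuh.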
+#ifndef __SINH_NVIDIA_API_H__ +#define __SINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sinh, nvidia) + +#endif // __SINH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc new file mode 100644 index 000000000..1636ce2c8 --- /dev/null +++ b/src/infiniop/ops/sinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSinhDescriptor( + infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSinh( + infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc new file 
mode 100644 index 000000000..99e723126 --- /dev/null +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc @@ -0,0 +1,48 @@ +#include "sqrt_cpu.h" + +namespace op::sqrt::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sqrt::cpu diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h new file mode 100644 index 000000000..3d026cf63 --- /dev/null +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h @@ -0,0 +1,22 @@ +#ifndef __SQRT_CPU_H__ +#define __SQRT_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sqrt, cpu) + +namespace op::sqrt::cpu { +typedef struct SqrtOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::sqrt(x); + } +} SqrtOp; +} // namespace op::sqrt::cpu + +#endif // __SQRT_CPU_H__ diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh new file mode 100644 index 000000000..c82cd7dd5 --- /dev/null +++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __SQRT_CUDA_H__ +#define __SQRT_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::sqrt::cuda { +typedef struct SqrtOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2sqrt(x); + } else if constexpr (std::is_same_v) { + return hsqrt(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __fsqrt_rn(x); + } else { + return std::sqrt(x); + } + } +} SqrtOp; +} // namespace op::sqrt::cuda + +#endif // __SQRT_CUDA_H__ diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu new file mode 100644 index 000000000..519d06e89 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sqrt_nvidia.cuh" + +namespace op::sqrt::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor 
**desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sqrt::nvidia diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh new file mode 100644 index 000000000..6cd98c814 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SQRT_NVIDIA_API_H__ +#define __SQRT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sqrt, nvidia) + +#endif // __SQRT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc new file mode 100644 index 000000000..b11c8a4b5 --- /dev/null +++ b/src/infiniop/ops/sqrt/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sqrt.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sqrt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sqrt_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSqrtDescriptor( + infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sqrt::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
infiniStatus_t infiniopSqrt( + infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..2947dfc5e --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,48 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..c3a22456c --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..bbd8facaa --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef 
__TAN_CUDA_H__ +#define __TAN_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +#define TAN_THRESHOLD 15000 + +namespace op::tan::cuda { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2sin(x) / h2cos(x); + } else if constexpr (std::is_same_v) { + float tan_f = __tanf(__half2float(x)); + if (std::fabs(tan_f) > TAN_THRESHOLD) { + return __float2half(tanf(__half2float(x))); + } + return __float2half(tan_f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + float tan_f0 = __tanf(x0); + float tan_f1 = __tanf(x1); + if (std::fabs(tan_f0) > TAN_THRESHOLD) { + tan_f0 = tanf(x0); + } + if (std::fabs(tan_f1) > TAN_THRESHOLD) { + tan_f1 = tanf(x1); + } + return __floats2bfloat162_rn(tan_f0, tan_f1); + } else if constexpr (std::is_same_v) { + float tan_f = __tanf(__bfloat162float(x)); + if (std::fabs(tan_f) > TAN_THRESHOLD) { + return __float2bfloat16_rn(tanf(__bfloat162float(x))); + } + return __float2bfloat16_rn(tan_f); + } else if constexpr (std::is_same_v) { + float tan_f = __tanf(x); + if (std::fabs(tan_f) > TAN_THRESHOLD) { + return tanf(x); + } + return tan_f; + } else { + return std::tan(x); + } + } +} TanOp; +} // namespace op::tan::cuda + +#endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu new file mode 100644 index 000000000..b4c24e2fe --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tan_nvidia.cuh" + +namespace op::tan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh new file mode 100644 index 000000000..ec620cbeb --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TAN_NVIDIA_API_H__ +#define __TAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tan, nvidia) + 
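+// The descriptor declared above dispatches to op::tan::cuda::TanOp in
+// ../cuda/kernel.cuh, which evaluates the fast __tanf intrinsic first and
+// falls back to the slower, more accurate tanf whenever |__tanf(x)| exceeds
+// TAN_THRESHOLD (15000), i.e. near the poles of tan where the fast path
+// loses precision; the packed half2 path instead computes h2sin(x) / h2cos(x).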
+#endif // __TAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..48ae8d48e --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/tan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/abs.py b/test/infiniop/abs.py new file mode 100644 index 000000000..df8748a97 --- /dev/null +++ b/test/infiniop/abs.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + 
LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def abs_op(x): + return torch.abs(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for abs operation + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Abs on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = abs_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAbsDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAbsWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_abs(): + check_error( + LIBINFINIOP.infiniopAbs( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_abs() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: abs_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_abs(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = 
args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acos.py b/test/infiniop/acos.py new file mode 100644 index 000000000..d39e966c4 --- /dev/null +++ b/test/infiniop/acos.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acos_op(x): + return torch.acos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for acos operation + # acos domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcosDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acos(): + check_error( + LIBINFINIOP.infiniopAcos( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acos() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if 
DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acosh.py b/test/infiniop/acosh.py new file mode 100644 index 000000000..c6777998b --- /dev/null +++ b/test/infiniop/acosh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acosh_op(x): + return torch.acosh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [1, 101) for acosh operation + # acosh domain is [1, +∞), so we use range [1, 101) + x_torch_tensor = torch.rand(shape) * 100 + 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acosh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcoshDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the 
shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcoshWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acosh(): + check_error( + LIBINFINIOP.infiniopAcosh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acosh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acosh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcoshDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asin.py b/test/infiniop/asin.py new file mode 100644 index 000000000..18cf0ec8e --- /dev/null +++ b/test/infiniop/asin.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asin_op(x): + return torch.asin(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for asin operation + # asin domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + 
device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asin_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asin(): + check_error( + LIBINFINIOP.infiniopAsin( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asin() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asin_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asinh.py b/test/infiniop/asinh.py new file mode 100644 index 000000000..d051d486e --- /dev/null +++ b/test/infiniop/asinh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types 
+_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asinh_op(x): + return torch.asinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for asinh operation + # asinh domain is (-∞, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asinh(): + check_error( + LIBINFINIOP.infiniopAsinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atan.py b/test/infiniop/atan.py new file mode 100644 index 000000000..01fceff5b --- /dev/null +++ b/test/infiniop/atan.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3, 13, 9, 17),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), +] 
+ + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def atan_op(x): + return torch.atan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for atan operation + # atan domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Atan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = atan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAtanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAtanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_atan(): + check_error( + LIBINFINIOP.infiniopAtan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_atan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: atan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_atan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAtanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py new file mode 100644 index 000000000..74073a6f2 --- /dev/null +++ b/test/infiniop/atanh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, 
+ TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def atanh_op(x): + return torch.atanh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for atanh operation + # atanh domain is (-1, 1), so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = atanh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAtanhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAtanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_atanh(): + check_error( + LIBINFINIOP.infiniopAtanh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_atanh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: atanh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = 
args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/ceil.py b/test/infiniop/ceil.py new file mode 100644 index 000000000..afc1993c1 --- /dev/null +++ b/test/infiniop/ceil.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def ceil_op(x): + return torch.ceil(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for ceil operation + # ceil domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Ceil on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = ceil_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCeilDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCeilWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_ceil(): + check_error( + LIBINFINIOP.infiniopCeil( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_ceil() + if sync is not None: + sync() + + atol, rtol = 
get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: ceil_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_ceil(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCeilDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..972f17b7b --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-4, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos_op(x): + return torch.cos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for cos operation + # cos domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( 
+ handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_cos() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cosh.py b/test/infiniop/cosh.py new file mode 100644 index 000000000..ee7994531 --- /dev/null +++ b/test/infiniop/cosh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cosh_op(x): + return torch.cosh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for cosh operation + # cosh domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = 
torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cosh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCoshDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCoshWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_cosh(): + check_error( + LIBINFINIOP.infiniopCosh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_cosh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cosh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCoshDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py new file mode 100644 index 000000000..f5f9c4cd9 --- /dev/null +++ b/test/infiniop/erf.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES 
= [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def erf_op(x): + return torch.erf(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-3, 3) for erf operation + # erf domain is (-∞, +∞), so we use range [-3, 3) + x_torch_tensor = torch.rand(shape) * 6 - 3 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = erf_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_erf(): + check_error( + LIBINFINIOP.infiniopErf( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_erf() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: erf_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_erf(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/floor.py b/test/infiniop/floor.py new file mode 100644 index 000000000..b981da809 --- /dev/null +++ b/test/infiniop/floor.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + 
((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def floor_op(x): + return torch.floor(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for floor operation + # floor domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Floor on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = floor_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateFloorDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetFloorWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_floor(): + check_error( + LIBINFINIOP.infiniopFloor( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_floor() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: floor_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_floor(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyFloorDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index a61cea018..20a9188d6 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -496,6 +496,589 @@ def rearrange_(lib): 
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t] +@OpRegister.operator +def abs_(lib): + lib.infiniopCreateAbsDescriptor.restype = c_int32 + lib.infiniopCreateAbsDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAbsWorkspaceSize.restype = c_int32 + lib.infiniopGetAbsWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAbs.restype = c_int32 + lib.infiniopAbs.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAbsDescriptor.restype = c_int32 + lib.infiniopDestroyAbsDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acos_(lib): + lib.infiniopCreateAcosDescriptor.restype = c_int32 + lib.infiniopCreateAcosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcosWorkspaceSize.restype = c_int32 + lib.infiniopGetAcosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcos.restype = c_int32 + lib.infiniopAcos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcosDescriptor.restype = c_int32 + lib.infiniopDestroyAcosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acosh_(lib): + lib.infiniopCreateAcoshDescriptor.restype = c_int32 + lib.infiniopCreateAcoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcoshWorkspaceSize.restype = c_int32 + lib.infiniopGetAcoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcosh.restype = c_int32 + lib.infiniopAcosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcoshDescriptor.restype = c_int32 + lib.infiniopDestroyAcoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asin_(lib): + lib.infiniopCreateAsinDescriptor.restype = c_int32 + lib.infiniopCreateAsinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsin.restype = c_int32 + lib.infiniopAsin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinDescriptor.restype = c_int32 + lib.infiniopDestroyAsinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asinh_(lib): + lib.infiniopCreateAsinhDescriptor.restype = c_int32 + lib.infiniopCreateAsinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinhWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsinh.restype = c_int32 + lib.infiniopAsinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, 
+ c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinhDescriptor.restype = c_int32 + lib.infiniopDestroyAsinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atan_(lib): + lib.infiniopCreateAtanDescriptor.restype = c_int32 + lib.infiniopCreateAtanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtan.restype = c_int32 + lib.infiniopAtan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanDescriptor.restype = c_int32 + lib.infiniopDestroyAtanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atanh_(lib): + lib.infiniopCreateAtanhDescriptor.restype = c_int32 + lib.infiniopCreateAtanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanhWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtanh.restype = c_int32 + lib.infiniopAtanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanhDescriptor.restype = c_int32 + lib.infiniopDestroyAtanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ceil_(lib): + lib.infiniopCreateCeilDescriptor.restype = c_int32 + lib.infiniopCreateCeilDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCeilWorkspaceSize.restype = c_int32 + lib.infiniopGetCeilWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCeil.restype = c_int32 + lib.infiniopCeil.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCeilDescriptor.restype = c_int32 + lib.infiniopDestroyCeilDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cosh_(lib): + lib.infiniopCreateCoshDescriptor.restype = c_int32 + lib.infiniopCreateCoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCoshWorkspaceSize.restype = c_int32 + lib.infiniopGetCoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + 
lib.infiniopCosh.restype = c_int32 + lib.infiniopCosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCoshDescriptor.restype = c_int32 + lib.infiniopDestroyCoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sinh_(lib): + lib.infiniopCreateSinhDescriptor.restype = c_int32 + lib.infiniopCreateSinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSinhWorkspaceSize.restype = c_int32 + lib.infiniopGetSinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSinh.restype = c_int32 + lib.infiniopSinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinhDescriptor.restype = c_int32 + lib.infiniopDestroySinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erf_(lib): + lib.infiniopCreateErfDescriptor.restype = c_int32 + lib.infiniopCreateErfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetErfWorkspaceSize.restype = c_int32 + lib.infiniopGetErfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopErf.restype = c_int32 + lib.infiniopErf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyErfDescriptor.restype = c_int32 + lib.infiniopDestroyErfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def floor_(lib): + lib.infiniopCreateFloorDescriptor.restype = c_int32 + lib.infiniopCreateFloorDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFloorWorkspaceSize.restype = c_int32 + lib.infiniopGetFloorWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFloor.restype = c_int32 + lib.infiniopFloor.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFloorDescriptor.restype = c_int32 + lib.infiniopDestroyFloorDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def neg_(lib): + lib.infiniopCreateNegDescriptor.restype = c_int32 + lib.infiniopCreateNegDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetNegWorkspaceSize.restype = c_int32 + lib.infiniopGetNegWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopNeg.restype = c_int32 + lib.infiniopNeg.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyNegDescriptor.restype = c_int32 + lib.infiniopDestroyNegDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reciprocal_(lib): + lib.infiniopCreateReciprocalDescriptor.restype = c_int32 + lib.infiniopCreateReciprocalDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetReciprocalWorkspaceSize.restype = 
c_int32 + lib.infiniopGetReciprocalWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopReciprocal.restype = c_int32 + lib.infiniopReciprocal.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReciprocalDescriptor.restype = c_int32 + lib.infiniopDestroyReciprocalDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def round_(lib): + lib.infiniopCreateRoundDescriptor.restype = c_int32 + lib.infiniopCreateRoundDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRoundWorkspaceSize.restype = c_int32 + lib.infiniopGetRoundWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopRound.restype = c_int32 + lib.infiniopRound.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRoundDescriptor.restype = c_int32 + lib.infiniopDestroyRoundDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sign_(lib): + lib.infiniopCreateSignDescriptor.restype = c_int32 + lib.infiniopCreateSignDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSignWorkspaceSize.restype = c_int32 + lib.infiniopGetSignWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSign.restype = c_int32 + lib.infiniopSign.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySignDescriptor.restype = c_int32 + lib.infiniopDestroySignDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sqrt_(lib): + lib.infiniopCreateSqrtDescriptor.restype = c_int32 + lib.infiniopCreateSqrtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSqrtWorkspaceSize.restype = c_int32 + lib.infiniopGetSqrtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSqrt.restype = c_int32 + lib.infiniopSqrt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySqrtDescriptor.restype = c_int32 + lib.infiniopDestroySqrtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log_(lib): + lib.infiniopCreateLogDescriptor.restype = c_int32 + lib.infiniopCreateLogDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLogWorkspaceSize.restype = c_int32 + lib.infiniopGetLogWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog.restype = c_int32 + lib.infiniopLog.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLogDescriptor.restype = c_int32 + lib.infiniopDestroyLogDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def tan_(lib): + lib.infiniopCreateTanDescriptor.restype = c_int32 + lib.infiniopCreateTanDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetTanWorkspaceSize.restype = c_int32 + lib.infiniopGetTanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTan.restype = c_int32 + lib.infiniopTan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTanDescriptor.restype = c_int32 + lib.infiniopDestroyTanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def relu_(lib): lib.infiniopCreateReluDescriptor.restype = c_int32 diff --git a/test/infiniop/log.py b/test/infiniop/log.py new file mode 100644 index 000000000..4f97de374 --- /dev/null +++ b/test/infiniop/log.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-7, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def log_op(x): + return torch.log(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0.1, 1.1) for log operation + # log domain is (0, +∞), so we use range [0.1, 1.1) + x_torch_tensor = torch.rand(shape) + 0.1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Log on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = log_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + 
tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log(): + check_error( + LIBINFINIOP.infiniopLog( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_log() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: log_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLogDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/neg.py b/test/infiniop/neg.py new file mode 100644 index 000000000..62607bce0 --- /dev/null +++ b/test/infiniop/neg.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def neg_op(x): + return torch.neg(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for neg operation + # This matches the original test case: * 100 - 200 + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, 
None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Neg on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = neg_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateNegDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetNegWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_neg(): + check_error( + LIBINFINIOP.infiniopNeg( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_neg() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: neg_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_neg(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyNegDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py new file mode 100644 index 000000000..4e816481c --- /dev/null +++ b/test/infiniop/reciprocal.py @@ -0,0 +1,168 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False 
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reciprocal_op(x): + return torch.reciprocal(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-10, 10) for reciprocal operation + # This matches the original test case: * 20 - 10 + # Note: Avoid values too close to zero to prevent division by zero issues + x_torch_tensor = torch.rand(shape) * 20 - 10 + # Ensure no zero values + x_torch_tensor = torch.where(x_torch_tensor == 0, torch.ones_like(x_torch_tensor), x_torch_tensor) + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = reciprocal_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReciprocalDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_reciprocal(): + check_error( + LIBINFINIOP.infiniopReciprocal( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_reciprocal() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reciprocal_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/round.py b/test/infiniop/round.py new file mode 100644 index 000000000..d6053f676 --- /dev/null +++ b/test/infiniop/round.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ 
+ # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def round_op(x): + return torch.round(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for round operation + # This matches the original test case: * 10 - 20 + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Round on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = round_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRoundDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRoundWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_round(): + check_error( + LIBINFINIOP.infiniopRound( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_round() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: round_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_round(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyRoundDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sign.py b/test/infiniop/sign.py new file mode 100644 index 000000000..f0eb5b5f8 --- /dev/null +++ b/test/infiniop/sign.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import 
c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sign_op(x): + return torch.sign(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sign operation + # sign domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sign on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sign_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSignDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSignWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sign(): + check_error( + LIBINFINIOP.infiniopSign( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sign() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sign_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sign(), device, NUM_PRERUN, 
NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySignDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sinh.py b/test/infiniop/sinh.py new file mode 100644 index 000000000..99bc02c58 --- /dev/null +++ b/test/infiniop/sinh.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sinh_op(x): + return torch.sinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sinh operation + # sinh domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = 
TestWorkspace(workspace_size.value, y.device) + + def lib_sinh(): + check_error( + LIBINFINIOP.infiniopSinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sqrt.py b/test/infiniop/sqrt.py new file mode 100644 index 000000000..6e1419971 --- /dev/null +++ b/test/infiniop/sqrt.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sqrt_op(x): + return torch.sqrt(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for sqrt operation + # sqrt domain is [0, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sqrt on {InfiniDeviceNames[device]} with 
shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sqrt_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSqrtDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSqrtWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sqrt(): + check_error( + LIBINFINIOP.infiniopSqrt( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sqrt() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sqrt_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sqrt(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySqrtDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tan.py b/test/infiniop/tan.py new file mode 100644 index 000000000..877f5dd58 --- /dev/null +++ b/test/infiniop/tan.py @@ -0,0 +1,167 @@ +import ctypes +import math +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-6, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-6, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + 
+ +def tan_op(x): + return torch.tan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-2π, 2π) for tan operation + # tan is defined for all reals except odd multiples of π/2, so we sample the range [-2π, 2π) + x_torch_tensor = torch.rand(shape) * 4 * math.pi - 2 * math.pi + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Tan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = tan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_tan(): + check_error( + LIBINFINIOP.infiniopTan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_tan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyTanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From c9247c809ec2143b7f81a87cb26cab21766227a1 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Thu, 15 Jan 2026 02:33:13 +0000 Subject: [PATCH 3/7] Issue/887 - Refactor binary and unary operators to reduce code duplication.
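Each operator header now expands a shared macro instead of repeating the same five declarations by hand. As a sketch of what the new headers resolve to (assuming ordinary C preprocessor token pasting; the macro itself is added in include/infiniop/ops/binary_op_api.h in this patch), BINARY_OP_API_DECLARE(div, Div) yields the same API that the hand-written div.h previously declared:

/* Illustrative expansion of BINARY_OP_API_DECLARE(div, Div); a sketch for review, not literal patch content. */
typedef struct InfiniopDescriptor *infiniopDivDescriptor_t;

__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle,
                                                        infiniopDivDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t c,
                                                        infiniopTensorDescriptor_t a,
                                                        infiniopTensorDescriptor_t b);

__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *c,
                                        const void *a,
                                        const void *b,
                                        void *stream);

__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc);

UNARY_OP_API_DECLARE(abs, Abs) is expected to produce the analogous single-input API (output y, input x) via include/infiniop/ops/unary_op_api.h.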
--- include/infiniop/ops/abs.h | 20 +- include/infiniop/ops/acos.h | 20 +- include/infiniop/ops/acosh.h | 20 +- include/infiniop/ops/asin.h | 20 +- include/infiniop/ops/asinh.h | 20 +- include/infiniop/ops/atan.h | 20 +- include/infiniop/ops/atanh.h | 20 +- include/infiniop/ops/binary_op_api.h | 50 ++ include/infiniop/ops/ceil.h | 20 +- include/infiniop/ops/cos.h | 20 +- include/infiniop/ops/cosh.h | 20 +- include/infiniop/ops/div.h | 22 +- include/infiniop/ops/erf.h | 20 +- include/infiniop/ops/floor.h | 20 +- include/infiniop/ops/log.h | 20 +- include/infiniop/ops/max.h | 22 +- include/infiniop/ops/min.h | 22 +- include/infiniop/ops/mod.h | 22 +- include/infiniop/ops/neg.h | 20 +- include/infiniop/ops/pow.h | 22 +- include/infiniop/ops/reciprocal.h | 20 +- include/infiniop/ops/round.h | 20 +- include/infiniop/ops/sign.h | 20 +- include/infiniop/ops/sinh.h | 20 +- include/infiniop/ops/sqrt.h | 20 +- include/infiniop/ops/tan.h | 20 +- include/infiniop/ops/unary_op_api.h | 48 ++ scripts/test_binary_unary.py | 143 +++++ src/infiniop/elementwise/binary.h | 261 +++++++++ .../elementwise/cpu/elementwise_cpu_impl.h | 130 +++++ .../nvidia/elementwise_nvidia_impl.cuh | 134 +++++ src/infiniop/elementwise/unary.h | 524 ++++++++++++++++++ src/infiniop/operator_impl.h | 288 ++++++++++ src/infiniop/ops/abs/cpu/abs_cpu.cc | 44 +- src/infiniop/ops/abs/cpu/abs_cpu.h | 21 +- src/infiniop/ops/abs/cuda/kernel.cuh | 20 +- src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 48 +- src/infiniop/ops/abs/operator.cc | 132 +---- src/infiniop/ops/acos/cpu/acos_cpu.cc | 44 +- src/infiniop/ops/acos/cpu/acos_cpu.h | 17 +- src/infiniop/ops/acos/cuda/kernel.cuh | 26 +- src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 48 +- src/infiniop/ops/acos/operator.cc | 132 +---- src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 44 +- src/infiniop/ops/acosh/cpu/acosh_cpu.h | 17 +- src/infiniop/ops/acosh/cuda/kernel.cuh | 26 +- src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 48 +- src/infiniop/ops/acosh/operator.cc | 132 +---- src/infiniop/ops/asin/cpu/asin_cpu.cc | 44 +- src/infiniop/ops/asin/cpu/asin_cpu.h | 17 +- src/infiniop/ops/asin/cuda/kernel.cuh | 26 +- src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 48 +- src/infiniop/ops/asin/operator.cc | 132 +---- src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 44 +- src/infiniop/ops/asinh/cpu/asinh_cpu.h | 17 +- src/infiniop/ops/asinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 48 +- src/infiniop/ops/asinh/operator.cc | 132 +---- src/infiniop/ops/atan/cpu/atan_cpu.cc | 44 +- src/infiniop/ops/atan/cpu/atan_cpu.h | 17 +- src/infiniop/ops/atan/cuda/kernel.cuh | 26 +- src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 48 +- src/infiniop/ops/atan/operator.cc | 132 +---- src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 44 +- src/infiniop/ops/atanh/cpu/atanh_cpu.h | 17 +- src/infiniop/ops/atanh/cuda/kernel.cuh | 26 +- src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 48 +- src/infiniop/ops/atanh/operator.cc | 132 +---- src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 44 +- src/infiniop/ops/ceil/cpu/ceil_cpu.h | 21 +- src/infiniop/ops/ceil/cuda/kernel.cuh | 28 +- src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 48 +- src/infiniop/ops/ceil/operator.cc | 132 +---- src/infiniop/ops/cos/cpu/cos_cpu.cc | 44 +- src/infiniop/ops/cos/cpu/cos_cpu.h | 17 +- src/infiniop/ops/cos/cuda/kernel.cuh | 26 +- src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 48 +- src/infiniop/ops/cos/operator.cc | 132 +---- src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 44 +- src/infiniop/ops/cosh/cpu/cosh_cpu.h | 17 +- src/infiniop/ops/cosh/cuda/kernel.cuh | 26 +- 
src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 48 +- src/infiniop/ops/cosh/operator.cc | 132 +---- src/infiniop/ops/div/cpu/div_cpu.cc | 46 +- src/infiniop/ops/div/cpu/div_cpu.h | 14 +- src/infiniop/ops/div/cuda/kernel.cuh | 19 +- src/infiniop/ops/div/nvidia/div_nvidia.cu | 51 +- src/infiniop/ops/div/operator.cc | 195 +------ src/infiniop/ops/erf/cpu/erf_cpu.cc | 44 +- src/infiniop/ops/erf/cpu/erf_cpu.h | 17 +- src/infiniop/ops/erf/cuda/kernel.cuh | 26 +- src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 48 +- src/infiniop/ops/erf/operator.cc | 132 +---- src/infiniop/ops/floor/cpu/floor_cpu.cc | 44 +- src/infiniop/ops/floor/cpu/floor_cpu.h | 21 +- src/infiniop/ops/floor/cuda/kernel.cuh | 28 +- src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 48 +- src/infiniop/ops/floor/operator.cc | 132 +---- src/infiniop/ops/log/cpu/log_cpu.cc | 44 +- src/infiniop/ops/log/cpu/log_cpu.h | 17 +- src/infiniop/ops/log/cuda/kernel.cuh | 26 +- src/infiniop/ops/log/nvidia/log_nvidia.cu | 48 +- src/infiniop/ops/log/operator.cc | 132 +---- src/infiniop/ops/max/cpu/max_cpu.cc | 46 +- src/infiniop/ops/max/cpu/max_cpu.h | 15 +- src/infiniop/ops/max/cuda/kernel.cuh | 19 +- src/infiniop/ops/max/nvidia/max_nvidia.cu | 51 +- src/infiniop/ops/max/operator.cc | 195 +------ src/infiniop/ops/min/cpu/min_cpu.cc | 46 +- src/infiniop/ops/min/cpu/min_cpu.h | 15 +- src/infiniop/ops/min/cuda/kernel.cuh | 19 +- src/infiniop/ops/min/nvidia/min_nvidia.cu | 51 +- src/infiniop/ops/min/operator.cc | 195 +------ src/infiniop/ops/mod/cpu/mod_cpu.cc | 45 +- src/infiniop/ops/mod/cpu/mod_cpu.h | 18 +- src/infiniop/ops/mod/cuda/kernel.cuh | 24 +- src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 51 +- src/infiniop/ops/mod/operator.cc | 135 +---- src/infiniop/ops/neg/cpu/neg_cpu.cc | 44 +- src/infiniop/ops/neg/cpu/neg_cpu.h | 15 +- src/infiniop/ops/neg/cuda/kernel.cuh | 17 +- src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 48 +- src/infiniop/ops/neg/operator.cc | 132 +---- src/infiniop/ops/pow/cpu/pow_cpu.cc | 45 +- src/infiniop/ops/pow/cpu/pow_cpu.h | 14 +- src/infiniop/ops/pow/cuda/kernel.cuh | 34 +- src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 51 +- src/infiniop/ops/pow/operator.cc | 135 +---- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 44 +- .../ops/reciprocal/cpu/reciprocal_cpu.h | 15 +- src/infiniop/ops/reciprocal/cuda/kernel.cuh | 26 +- .../reciprocal/nvidia/reciprocal_nvidia.cu | 48 +- src/infiniop/ops/reciprocal/operator.cc | 132 +---- src/infiniop/ops/round/cpu/round_cpu.cc | 44 +- src/infiniop/ops/round/cpu/round_cpu.h | 20 +- src/infiniop/ops/round/cuda/kernel.cuh | 28 +- src/infiniop/ops/round/nvidia/round_nvidia.cu | 48 +- src/infiniop/ops/round/operator.cc | 132 +---- src/infiniop/ops/sign/cpu/sign_cpu.cc | 44 +- src/infiniop/ops/sign/cpu/sign_cpu.h | 15 +- src/infiniop/ops/sign/cuda/kernel.cuh | 19 +- src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 48 +- src/infiniop/ops/sign/operator.cc | 132 +---- src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 44 +- src/infiniop/ops/sinh/cpu/sinh_cpu.h | 17 +- src/infiniop/ops/sinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 48 +- src/infiniop/ops/sinh/operator.cc | 132 +---- src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 44 +- src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 17 +- src/infiniop/ops/sqrt/cuda/kernel.cuh | 26 +- src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 48 +- src/infiniop/ops/sqrt/operator.cc | 132 +---- src/infiniop/ops/tan/cpu/tan_cpu.cc | 44 +- src/infiniop/ops/tan/cpu/tan_cpu.h | 17 +- src/infiniop/ops/tan/cuda/kernel.cuh | 49 +- src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 48 +- 
src/infiniop/ops/tan/operator.cc | 132 +---- src/infiniop/ops/tanh/cuda/kernel.cuh | 38 +- src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 53 +- 160 files changed, 1885 insertions(+), 7179 deletions(-) create mode 100644 include/infiniop/ops/binary_op_api.h create mode 100644 include/infiniop/ops/unary_op_api.h create mode 100755 scripts/test_binary_unary.py create mode 100644 src/infiniop/elementwise/binary.h create mode 100644 src/infiniop/elementwise/cpu/elementwise_cpu_impl.h create mode 100644 src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh create mode 100644 src/infiniop/elementwise/unary.h create mode 100644 src/infiniop/operator_impl.h diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h index 7b5872657..1d1f1cbd1 100644 --- a/include/infiniop/ops/abs.h +++ b/include/infiniop/ops/abs.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ABS_API_H__ #define __INFINIOP_ABS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); +UNARY_OP_API_DECLARE(abs, Abs) #endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h index fe6af01ed..c2f4de837 100644 --- a/include/infiniop/ops/acos.h +++ b/include/infiniop/ops/acos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOS_API_H__ #define __INFINIOP_ACOS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); +UNARY_OP_API_DECLARE(acos, Acos) #endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h index be28918bb..e8630b7d5 100644 --- a/include/infiniop/ops/acosh.h +++ b/include/infiniop/ops/acosh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOSH_API_H__ #define __INFINIOP_ACOSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); 
+UNARY_OP_API_DECLARE(acosh, Acosh) #endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h index 2aac6d1e1..1a8bdd7b8 100644 --- a/include/infiniop/ops/asin.h +++ b/include/infiniop/ops/asin.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASIN_API_H__ #define __INFINIOP_ASIN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); +UNARY_OP_API_DECLARE(asin, Asin) #endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h index d1385fc01..2a3aebf5a 100644 --- a/include/infiniop/ops/asinh.h +++ b/include/infiniop/ops/asinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASINH_API_H__ #define __INFINIOP_ASINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); +UNARY_OP_API_DECLARE(asinh, Asinh) #endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h index 3b1a5bde3..18eed316f 100644 --- a/include/infiniop/ops/atan.h +++ b/include/infiniop/ops/atan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATAN_API_H__ #define __INFINIOP_ATAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAtanDescriptor(infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); +UNARY_OP_API_DECLARE(atan, Atan) #endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h index 800afd5d5..e7db5b53c 100644 --- a/include/infiniop/ops/atanh.h +++ b/include/infiniop/ops/atanh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATANH_API_H__ #define __INFINIOP_ATANH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - 
-__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); +UNARY_OP_API_DECLARE(atanh, Atanh) #endif diff --git a/include/infiniop/ops/binary_op_api.h b/include/infiniop/ops/binary_op_api.h new file mode 100644 index 000000000..4ab2401b9 --- /dev/null +++ b/include/infiniop/ops/binary_op_api.h @@ -0,0 +1,50 @@ +#ifndef __INFINIOP_BINARY_OP_API_H__ +#define __INFINIOP_BINARY_OP_API_H__ + +#include "../operator_descriptor.h" + +/** + * @brief Macro to generate the C API header for a binary operator. + * + * This macro generates all the necessary declarations for a binary operator: + * - Descriptor type definition + * - Create descriptor function + * - Get workspace size function + * - Execute operator function + * - Destroy descriptor function + * + * Usage: + * BINARY_OP_API_DECLARE(div, Div) + * BINARY_OP_API_DECLARE(pow, Pow) + * + * @param OP_NAME Lowercase operator name (e.g., div, pow, mod) + * @param OP_NAME_UPPER Uppercase operator name (e.g., Div, Pow, Mod) + */ +#define BINARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER) \ + \ + typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \ + \ + __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c, \ + infiniopTensorDescriptor_t a, \ + infiniopTensorDescriptor_t b); \ + \ + __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size); \ + \ + __C __export infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream); \ + \ + __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc); + +#endif // __INFINIOP_BINARY_OP_API_H__ diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h index 4539d77fd..8fca73b2e 100644 --- a/include/infiniop/ops/ceil.h +++ b/include/infiniop/ops/ceil.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_CEIL_API_H__ #define __INFINIOP_CEIL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); +UNARY_OP_API_DECLARE(ceil, Ceil) #endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h index 8f0b6eeb7..ed33b0a0e 100644 --- a/include/infiniop/ops/cos.h +++ b/include/infiniop/ops/cos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_COS_API_H__ #define __INFINIOP_COS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct 
InfiniopDescriptor *infiniopCosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); +UNARY_OP_API_DECLARE(cos, Cos) #endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h index 3328151ad..b607b8fd1 100644 --- a/include/infiniop/ops/cosh.h +++ b/include/infiniop/ops/cosh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_COSH_API_H__ #define __INFINIOP_COSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); +UNARY_OP_API_DECLARE(cosh, Cosh) #endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h index e539b440c..6f146bf4c 100644 --- a/include/infiniop/ops/div.h +++ b/include/infiniop/ops/div.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_DIV_API_H__ #define __INFINIOP_DIV_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; - -__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); +BINARY_OP_API_DECLARE(div, Div) #endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h index 8cbb8fb74..0dcc149da 100644 --- a/include/infiniop/ops/erf.h +++ b/include/infiniop/ops/erf.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ERF_API_H__ #define __INFINIOP_ERF_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; - -__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); +UNARY_OP_API_DECLARE(erf, Erf) 
#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h index 2f65f8f4a..02efc6761 100644 --- a/include/infiniop/ops/floor.h +++ b/include/infiniop/ops/floor.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_FLOOR_API_H__ #define __INFINIOP_FLOOR_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; - -__C __export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); +UNARY_OP_API_DECLARE(floor, Floor) #endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h index f5bec4382..3892ccb6e 100644 --- a/include/infiniop/ops/log.h +++ b/include/infiniop/ops/log.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_LOG_API_H__ #define __INFINIOP_LOG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; - -__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); +UNARY_OP_API_DECLARE(log, Log) #endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h index e6f2f5d4c..4b91e5c83 100644 --- a/include/infiniop/ops/max.h +++ b/include/infiniop/ops/max.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MAX_API_H__ #define __INFINIOP_MAX_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); +BINARY_OP_API_DECLARE(max, Max) #endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h index f72f0c4db..1496806df 100644 --- a/include/infiniop/ops/min.h +++ b/include/infiniop/ops/min.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MIN_API_H__ #define __INFINIOP_MIN_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - 
-__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); +BINARY_OP_API_DECLARE(min, Min) #endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h index 5a6cd5bbf..e4fcd571e 100644 --- a/include/infiniop/ops/mod.h +++ b/include/infiniop/ops/mod.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MOD_API_H__ #define __INFINIOP_MOD_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopModDescriptor_t; - -__C __export infiniStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); +BINARY_OP_API_DECLARE(mod, Mod) #endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h index 4d3b06e21..0d18bbd5c 100644 --- a/include/infiniop/ops/neg.h +++ b/include/infiniop/ops/neg.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_NEG_API_H__ #define __INFINIOP_NEG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; - -__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); +UNARY_OP_API_DECLARE(neg, Neg) #endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h index 6449d8622..f4e263a58 100644 --- a/include/infiniop/ops/pow.h +++ b/include/infiniop/ops/pow.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_POW_API_H__ #define __INFINIOP_POW_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; - -__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); +BINARY_OP_API_DECLARE(pow, Pow) #endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h index 73836fea4..7d5626176 100644 --- a/include/infiniop/ops/reciprocal.h +++ 
b/include/infiniop/ops/reciprocal.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_RECIPROCAL_API_H__ #define __INFINIOP_RECIPROCAL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; - -__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); +UNARY_OP_API_DECLARE(reciprocal, Reciprocal) #endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h index 18c7fe44e..1bf4377ff 100644 --- a/include/infiniop/ops/round.h +++ b/include/infiniop/ops/round.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ROUND_API_H__ #define __INFINIOP_ROUND_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; - -__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, - infiniopRoundDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); +UNARY_OP_API_DECLARE(round, Round) #endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h index fe47c7190..ef7854de8 100644 --- a/include/infiniop/ops/sign.h +++ b/include/infiniop/ops/sign.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SIGN_API_H__ #define __INFINIOP_SIGN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, - infiniopSignDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); +UNARY_OP_API_DECLARE(sign, Sign) #endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h index a5325fb81..ea8511a2b 100644 --- a/include/infiniop/ops/sinh.h +++ b/include/infiniop/ops/sinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SINH_API_H__ #define __INFINIOP_SINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, - infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); - -__C 
__export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); +UNARY_OP_API_DECLARE(sinh, Sinh) #endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h index db04ec8bc..6df6fe89c 100644 --- a/include/infiniop/ops/sqrt.h +++ b/include/infiniop/ops/sqrt.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SQRT_API_H__ #define __INFINIOP_SQRT_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, - infiniopSqrtDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); +UNARY_OP_API_DECLARE(sqrt, Sqrt) #endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h index 69fc47bf1..d4a2f0bf2 100644 --- a/include/infiniop/ops/tan.h +++ b/include/infiniop/ops/tan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_TAN_API_H__ #define __INFINIOP_TAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; - -__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, - infiniopTanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); +UNARY_OP_API_DECLARE(tan, Tan) #endif diff --git a/include/infiniop/ops/unary_op_api.h b/include/infiniop/ops/unary_op_api.h new file mode 100644 index 000000000..eefe3c3a4 --- /dev/null +++ b/include/infiniop/ops/unary_op_api.h @@ -0,0 +1,48 @@ +#ifndef __INFINIOP_UNARY_OP_API_H__ +#define __INFINIOP_UNARY_OP_API_H__ + +#include "../operator_descriptor.h" + +/** + * @brief Macro to generate the C API header for a unary operator. 
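+ * + * For example, UNARY_OP_API_DECLARE(cos, Cos) expands to the infiniopCosDescriptor_t typedef and the infiniopCreateCosDescriptor, infiniopGetCosWorkspaceSize, infiniopCos, and infiniopDestroyCosDescriptor declarations that were previously written out by hand in cos.h.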
+ * + * This macro generates all the necessary declarations for a unary operator: + * - Descriptor type definition + * - Create descriptor function + * - Get workspace size function + * - Execute operator function + * - Destroy descriptor function + * + * Usage: + * UNARY_OP_API_DECLARE(abs, Abs) + * UNARY_OP_API_DECLARE(log, Log) + * + * @param OP_NAME Lowercase operator name (e.g., abs, log, sin) + * @param OP_NAME_UPPER Uppercase operator name (e.g., Abs, Log, Sin) + */ +#define UNARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER) \ + \ + typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \ + \ + __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t y, \ + infiniopTensorDescriptor_t x); \ + \ + __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size); \ + \ + __C __export infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream); \ + \ + __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc); + +#endif // __INFINIOP_UNARY_OP_API_H__ diff --git a/scripts/test_binary_unary.py b/scripts/test_binary_unary.py new file mode 100755 index 000000000..8dbbfbf53 --- /dev/null +++ b/scripts/test_binary_unary.py @@ -0,0 +1,143 @@ +import os +import subprocess +from set_env import set_env +import sys + +PROJECT_DIR = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test", "infiniop") +) +os.chdir(PROJECT_DIR) + + +def run_tests(args): + failed = [] + + # Binary operators (refactored) + binary_tests = [ + "div.py", + "pow.py", + "mod.py", + "min.py", + "max.py", + ] + + # Unary operators (refactored) + unary_tests = [ + "abs.py", + "log.py", + "cos.py", + "sqrt.py", + "neg.py", + "sign.py", + "reciprocal.py", + "round.py", + "floor.py", + "ceil.py", + "erf.py", + "cosh.py", + "sinh.py", + "tan.py", + "acos.py", + "acosh.py", + "asin.py", + "asinh.py", + "atan.py", + "atanh.py", + ] + + all_tests = binary_tests + unary_tests + + print("\033[94m" + "=" * 60 + "\033[0m") + print("\033[94mTesting Binary and Unary Operators (Refactored)\033[0m") + print("\033[94m" + "=" * 60 + "\033[0m") + print(f"\033[94mTotal tests: {len(all_tests)}\033[0m") + print(f"\033[94m - Binary operators: {len(binary_tests)}\033[0m") + print(f"\033[94m - Unary operators: {len(unary_tests)}\033[0m") + print() + + for test in all_tests: + if not os.path.exists(test): + print(f"\033[93m[SKIP] {test} - test file not found\033[0m") + continue + + print(f"\033[96m[RUN] {test}\033[0m", end=" ... 
", flush=True) + result = subprocess.run( + f"python3 {test} {args}", + text=True, + encoding="utf-8", + shell=True, + capture_output=True + ) + + if result.returncode != 0: + print(f"\033[91m[FAIL]\033[0m") + print(f"\033[91mError output:\033[0m") + print(result.stderr) + failed.append(test) + else: + print(f"\033[92m[PASS]\033[0m") + + return failed + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Test refactored binary and unary operators", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Test on CPU only (default) + python3 scripts/test_binary_unary.py --cpu + + # Test on NVIDIA GPU only + python3 scripts/test_binary_unary.py --nvidia + + # Test on both CPU and NVIDIA + python3 scripts/test_binary_unary.py --cpu --nvidia + + # Test with debug mode + python3 scripts/test_binary_unary.py --cpu --debug + + # Test with profiling + python3 scripts/test_binary_unary.py --nvidia --profile + """ + ) + + # Device selection arguments (same as test files) + parser.add_argument("--cpu", action="store_true", help="Run CPU tests") + parser.add_argument("--nvidia", action="store_true", help="Run NVIDIA GPU tests") + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + parser.add_argument("--profile", action="store_true", help="Enable profiling") + + args, unknown = parser.parse_known_args() + + # Build command line arguments to pass to test files + test_args = [] + if args.cpu: + test_args.append("--cpu") + if args.nvidia: + test_args.append("--nvidia") + if args.debug: + test_args.append("--debug") + if args.profile: + test_args.append("--profile") + + # Add any unknown arguments (for compatibility) + test_args.extend(unknown) + + set_env() + failed = run_tests(" ".join(test_args)) + + print() + print("\033[94m" + "=" * 60 + "\033[0m") + if len(failed) == 0: + print("\033[92m✓ All tests passed!\033[0m") + else: + print(f"\033[91m✗ {len(failed)} test(s) failed:\033[0m") + for test in failed: + print(f"\033[91m - {test}\033[0m") + print("\033[94m" + "=" * 60 + "\033[0m") + + exit(len(failed)) diff --git a/src/infiniop/elementwise/binary.h b/src/infiniop/elementwise/binary.h new file mode 100644 index 000000000..1823fac3f --- /dev/null +++ b/src/infiniop/elementwise/binary.h @@ -0,0 +1,261 @@ +#ifndef __INFINIOP_ELEMENTWISE_BINARY_H__ +#define __INFINIOP_ELEMENTWISE_BINARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include +#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::binary { + +/** + * @brief Represents all the currently defined binary operations. + * + * This enum is used to specify which binary operation to perform + * in the generic BinaryOp template. + */ +enum class BinaryMode { + // Arithmetic operations: + Add, + Subtract, + Multiply, + Divide, + Pow, + Mod, + Max, + Min, + // Logical operations (for future use): + // And, Or, Xor, Less, LessOrEqual, Equal, Greater, GreaterOrEqual +}; + +/** + * @brief Generic binary operation template that performs different operations + * based on the specified BinaryMode. + * + * This template allows multiple binary operators (pow, div, mod, min, max, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. 
+ * + * @tparam Mode The binary operation mode (from BinaryMode enum) + */ +template +struct BinaryOp { + static constexpr size_t num_inputs = 2; + + template + T operator()(const T &a, const T &b) const { + if constexpr (Mode == BinaryMode::Add) { + return a + b; + } else if constexpr (Mode == BinaryMode::Subtract) { + return a - b; + } else if constexpr (Mode == BinaryMode::Multiply) { + return a * b; + } else if constexpr (Mode == BinaryMode::Divide) { + return a / b; + } else if constexpr (Mode == BinaryMode::Pow) { + return std::pow(a, b); + } else if constexpr (Mode == BinaryMode::Mod) { + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } else if constexpr (Mode == BinaryMode::Max) { + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + } else if constexpr (Mode == BinaryMode::Min) { + if constexpr (std::is_floating_point_v) { + return std::fmin(a, b); + } else { + return std::min(a, b); + } + } else { + static_assert(Mode != Mode, "Unsupported binary operation mode"); + return a; + } + } +}; + +#ifdef __CUDACC__ +/** + * @brief CUDA-specific binary operation template that performs different operations + * based on the specified BinaryMode, using CUDA-optimized functions. + * + * This template provides CUDA device functions optimized for GPU execution, + * using intrinsics like __powf, __h2div, __hmin2, __hmax2, etc. + * + * @tparam Mode The binary operation mode (from BinaryMode enum) + */ +namespace cuda { +template +struct BinaryOp { + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (Mode == BinaryMode::Add) { + if constexpr (std::is_same_v) { + return __hadd2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hadd(a, b); + } else if constexpr (std::is_same_v) { + return __fadd_rn(a, b); + } else { + return a + b; + } + } else if constexpr (Mode == BinaryMode::Subtract) { + if constexpr (std::is_same_v) { + return __hsub2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hsub(a, b); + } else if constexpr (std::is_same_v) { + return __fsub_rn(a, b); + } else { + return a - b; + } + } else if constexpr (Mode == BinaryMode::Multiply) { + if constexpr (std::is_same_v) { + return __hmul2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hmul(a, b); + } else if constexpr (std::is_same_v) { + return __fmul_rd(a, b); + } else { + return a * b; + } + } else if constexpr (Mode == BinaryMode::Divide) { + if constexpr (std::is_same_v) { + return __h2div(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a / b; + } else if constexpr (std::is_same_v) { + return __fdividef(a, b); + } else { + return a / b; + } + } else if constexpr (Mode == BinaryMode::Pow) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float ans_f = __powf(a_, b_); + return __float2half(isnan(ans_f) ? 
std::pow(a_, b_) : ans_f); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(__powf(a_, b_)); + } else if constexpr (std::is_same_v) { + return __powf(a, b); + } else { + return std::pow(a, b); + } + } else if constexpr (Mode == BinaryMode::Mod) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(std::fmod(a_, b_)); + } else if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } else if constexpr (Mode == BinaryMode::Max) { + if constexpr (std::is_same_v) { + return __hmax2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a > b ? a : b; + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else { + return a > b ? a : b; + } + } else if constexpr (Mode == BinaryMode::Min) { + if constexpr (std::is_same_v) { + return __hmin2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a < b ? a : b; + } else if constexpr (std::is_same_v) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } else { + static_assert(Mode != Mode, "Unsupported binary operation mode"); + return a; + } + } +}; +} // namespace cuda +#endif // __CUDACC__ + +/** + * @brief Macro to define a binary elementwise descriptor for a specific operation. + * + * This macro simplifies the definition of binary operators (pow, div, mod, min, max, etc.) + * by automatically generating the Descriptor class and operation struct using the + * ELEMENTWISE_DESCRIPTOR macro and BinaryOp template. + * + * Usage: + * BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, BinaryMode::Pow) + * BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, BinaryMode::Divide) + * + * @param OP The operator name (e.g., pow, div, mod) + * @param NAMESPACE The device namespace (e.g., cpu, nvidia) + * @param MODE The BinaryMode enum value for this operation + */ +#define BINARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::NAMESPACE { \ + using Op = op::elementwise::binary::BinaryOp; \ + } + +/** + * @brief Macro to define a binary elementwise descriptor for CUDA/NVIDIA backend. + * + * This macro is similar to BINARY_ELEMENTWISE_DESCRIPTOR but uses the CUDA-specific + * BinaryOp implementation for better GPU performance. 
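+ * + * For example, BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(div, nvidia, BinaryMode::Divide) declares the op::div::nvidia Descriptor and defines op::div::cuda::Op as the CUDA-optimized divide functor that the NVIDIA kernel instantiates.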
+ * + * Usage: + * BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(pow, nvidia, BinaryMode::Pow) + * BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(div, nvidia, BinaryMode::Divide) + * + * @param OP The operator name (e.g., pow, div, mod) + * @param NAMESPACE The device namespace (e.g., nvidia) + * @param MODE The BinaryMode enum value for this operation + */ +#ifdef __CUDACC__ +#define BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::cuda { \ + using Op = op::elementwise::binary::cuda::BinaryOp; \ + } +#endif // __CUDACC__ + +} // namespace op::elementwise::binary + +#endif // __INFINIOP_ELEMENTWISE_BINARY_H__ diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h new file mode 100644 index 000000000..030f4d87e --- /dev/null +++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h @@ -0,0 +1,130 @@ +#ifndef __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ +#define __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ + +#include "../../../utils/check.h" +#include "../../../utils/result.hpp" +#include "../../devices/cpu/common_cpu.h" +#include "elementwise_cpu.h" + +/** + * @brief Generic implementation for elementwise CPU operators. + * + * This file provides a generic implementation template that can be used + * by all binary and unary operators to reduce code duplication. + * + * Usage: + * #include "elementwise_cpu_impl.h" + * namespace op::pow::cpu { + * using Op = op::elementwise::binary::BinaryOp; + * ELEMENTWISE_CPU_IMPL_BINARY(pow) + * } + * + * namespace op::sqrt::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) + * } + */ + +/** + * @brief Macro to generate binary operator implementation. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators, using the generic implementation. + * + * Usage: + * namespace op::pow::cpu { + * using Op = op::elementwise::binary::BinaryOp; + * ELEMENTWISE_CPU_IMPL_BINARY(pow) + * } + */ +#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &out_shape = out_desc->shape(); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Macro to generate unary operator implementation. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators, using the generic implementation. 
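+ * + * For example, ELEMENTWISE_CPU_IMPL_UNARY(sqrt) emits a create() that accepts only F16/F32 tensors whose input and output shapes match, plus a calculate() that dispatches to the F16 or F32 instantiation of the operator.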
+ * + * Usage: + * namespace op::sqrt::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh new file mode 100644 index 000000000..39b78884a --- /dev/null +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -0,0 +1,134 @@ +#ifndef __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ +#define __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ + +#include "../../../utils/check.h" +#include "../../../utils/result.hpp" +#include "../../devices/nvidia/nvidia_common.cuh" +#include "elementwise_nvidia.cuh" +#include +#include + +/** + * @brief Generic implementation for elementwise NVIDIA/CUDA operators. + * + * This file provides a generic implementation template that can be used + * by all binary and unary operators to reduce code duplication. + * + * Usage: + * #include "elementwise_nvidia_impl.cuh" + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + * + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ + +/** + * @brief Macro to generate binary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators, using the generic implementation. 
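+ * + * For example, ELEMENTWISE_NVIDIA_IMPL_BINARY(div) emits a calculate() that first checks that the supplied workspace is large enough and then dispatches to _device_info->calculate<256, cuda::Op, half> or <256, cuda::Op, float> depending on the tensor dtype.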
+ * + * Usage: + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &c_shape = out_desc->shape(); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Macro to generate unary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators, using the generic implementation. + * + * Usage: + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h new file mode 100644 index 000000000..9f41dedb2 --- /dev/null +++ b/src/infiniop/elementwise/unary.h @@ -0,0 +1,524 @@ +#ifndef __INFINIOP_ELEMENTWISE_UNARY_H__ +#define __INFINIOP_ELEMENTWISE_UNARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include 
+#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::unary { + +/** + * @brief Represents all the currently defined unary operations. + * + * This enum is used to specify which unary operation to perform + * in the generic UnaryOp template. + */ +enum class UnaryMode { + // Math operations: + Abs, + Exp, + Log, + Reciprocal, + Sqrt, + Neg, + Ceil, + Floor, + Round, + Sin, + Cos, + Tan, + Asin, + Acos, + Atan, + Sinh, + Cosh, + Tanh, + Asinh, + Acosh, + Atanh, + Relu, + Sigmoid, + Sign, + Erf, +}; + +/** + * @brief Generic unary operation template that performs different operations + * based on the specified UnaryMode. + * + * This template allows multiple unary operators (abs, log, sin, cos, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. + * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + return std::exp(x); + } else if constexpr (Mode == UnaryMode::Log) { + return std::log(x); + } else if constexpr (Mode == UnaryMode::Reciprocal) { + return T(1) / x; + } else if constexpr (Mode == UnaryMode::Sqrt) { + return std::sqrt(x); + } else if constexpr (Mode == UnaryMode::Neg) { + return -x; + } else if constexpr (Mode == UnaryMode::Ceil) { + return std::ceil(x); + } else if constexpr (Mode == UnaryMode::Floor) { + return std::floor(x); + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + return std::sin(x); + } else if constexpr (Mode == UnaryMode::Cos) { + return std::cos(x); + } else if constexpr (Mode == UnaryMode::Tan) { + return std::tan(x); + } else if constexpr (Mode == UnaryMode::Asin) { + return std::asin(x); + } else if constexpr (Mode == UnaryMode::Acos) { + return std::acos(x); + } else if constexpr (Mode == UnaryMode::Atan) { + return std::atan(x); + } else if constexpr (Mode == UnaryMode::Sinh) { + return std::sinh(x); + } else if constexpr (Mode == UnaryMode::Cosh) { + return std::cosh(x); + } else if constexpr (Mode == UnaryMode::Tanh) { + return std::tanh(x); + } else if constexpr (Mode == UnaryMode::Asinh) { + return std::asinh(x); + } else if constexpr (Mode == UnaryMode::Acosh) { + return std::acosh(x); + } else if constexpr (Mode == UnaryMode::Atanh) { + return std::atanh(x); + } else if constexpr (Mode == UnaryMode::Relu) { + return x > T(0) ? x : T(0); + } else if constexpr (Mode == UnaryMode::Sigmoid) { + return T(1) / (T(1) + std::exp(-x)); + } else if constexpr (Mode == UnaryMode::Sign) { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } else if constexpr (Mode == UnaryMode::Erf) { + return std::erf(x); + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; + +#ifdef __CUDACC__ +/** + * @brief CUDA-specific unary operation template that performs different operations + * based on the specified UnaryMode, using CUDA-optimized functions. + * + * This template provides CUDA device functions optimized for GPU execution, + * using intrinsics like __habs2, __logf, __sinf, etc. 
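+ * + * For example, UnaryOp<UnaryMode::Sqrt> lowers to h2sqrt for packed half2 values, hsqrt for half, and __fsqrt_rn for float, handling bfloat16 through per-lane sqrtf and falling back to std::sqrt for any other type.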
+ * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +namespace cuda { +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_same_v) { + return __habs2(x); + } else if constexpr (std::is_same_v) { + return __habs(x); + } else if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__expf(x_f2.x), __expf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__expf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + return __floats2bfloat162_rn(__expf(x_f2.x), __expf(x_f2.y)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__expf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __expf(x); + } else { + return std::exp(x); + } + } else if constexpr (Mode == UnaryMode::Log) { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } else if constexpr (Mode == UnaryMode::Reciprocal) { + if constexpr (std::is_same_v) { + return h2rcp(x); + } else if constexpr (std::is_same_v) { + return hrcp(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frcp_rn(x); + } else { + return T(1) / x; + } + } else if constexpr (Mode == UnaryMode::Sqrt) { + if constexpr (std::is_same_v) { + return h2sqrt(x); + } else if constexpr (std::is_same_v) { + return hsqrt(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sqrtf(x0), sqrtf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sqrtf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __fsqrt_rn(x); + } else { + return std::sqrt(x); + } + } else if constexpr (Mode == UnaryMode::Neg) { + if constexpr (std::is_same_v) { + return __hneg2(x); + } else if constexpr (std::is_same_v) { + return __hneg(x); + } else { + return -x; + } + } else if constexpr (Mode == UnaryMode::Ceil) { + if constexpr (std::is_same_v) { + return h2ceil(x); + } else if constexpr (std::is_same_v) { + return hceil(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return ceilf(x); 
+ } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::ceil(x); + } + } else if constexpr (Mode == UnaryMode::Floor) { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_same_v) { + return h2rint(x); + } else if constexpr (std::is_same_v) { + return hrint(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(rintf(x0), rintf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(rintf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return rintf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__sinf(x_f2.x), __sinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__sinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sinf(x0), sinf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __sinf(x); + } else { + return std::sin(x); + } + } else if constexpr (Mode == UnaryMode::Cos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__cosf(x_f2.x), __cosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__cosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(cosf(x0), cosf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(cosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __cosf(x); + } else { + return std::cos(x); + } + } else if constexpr (Mode == UnaryMode::Tan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(tanf(x_f2.x), tanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(tanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return tanf(x); + } else { + return std::tan(x); + } + } else if constexpr (Mode == UnaryMode::Asin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(asinf(x_f2.x), asinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return asinf(x); + } else { + return std::asin(x); + } + } else if constexpr (Mode == UnaryMode::Acos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return 
__float22half2_rn(make_float2(acosf(x_f2.x), acosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return acosf(x); + } else { + return std::acos(x); + } + } else if constexpr (Mode == UnaryMode::Atan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(atanf(x_f2.x), atanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return atanf(x); + } else { + return std::atan(x); + } + } else if constexpr (Mode == UnaryMode::Sinh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(sinhf(x_f2.x), sinhf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(sinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return sinhf(x); + } else { + return std::sinh(x); + } + } else if constexpr (Mode == UnaryMode::Cosh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(coshf(x_f2.x), coshf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } else if constexpr (Mode == UnaryMode::Tanh) { + if constexpr (std::is_same_v) { + return __h2tanh(x); + } else if constexpr (std::is_same_v) { + return __float2half(tanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(tanhf(f0), tanhf(f1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(tanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return tanhf(x); + } else { + return std::tanh(x); + } + } else if constexpr (Mode == UnaryMode::Asinh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinhf(x); + } else { + return std::asinh(x); + } + } else if constexpr (Mode == UnaryMode::Acosh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acoshf(x); + } else { + return std::acosh(x); + } + } else if constexpr (Mode == UnaryMode::Atanh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return 
__float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanhf(x); + } else { + return std::atanh(x); + } + } else if constexpr (Mode == UnaryMode::Relu) { + if constexpr (std::is_same_v) { + return __hmax2(x, __floats2half2_rn(0.0f, 0.0f)); + } else { + return x > T(0) ? x : T(0); + } + } else if constexpr (Mode == UnaryMode::Sigmoid) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + float2 exp_neg_x = make_float2(__expf(-x_f2.x), __expf(-x_f2.y)); + return __float22half2_rn(make_float2(1.0f / (1.0f + exp_neg_x.x), 1.0f / (1.0f + exp_neg_x.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(1.0f / (1.0f + __expf(-x_))); + } else if constexpr (std::is_same_v) { + return 1.0f / (1.0f + __expf(-x)); + } else { + return T(1) / (T(1) + std::exp(-x)); + } + } else if constexpr (Mode == UnaryMode::Sign) { + if constexpr (std::is_same_v) { + const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); + return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); + } else if constexpr (std::is_same_v) { + return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); + } else { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } + } else if constexpr (Mode == UnaryMode::Erf) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(erff(x_f2.x), erff(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; +} // namespace cuda +#endif // __CUDACC__ + +/** + * @brief Macro to define a unary elementwise descriptor for a specific operation. + * + * This macro simplifies the definition of unary operators (abs, log, sin, cos, etc.) + * by automatically generating the Descriptor class and operation struct using the + * ELEMENTWISE_DESCRIPTOR macro and UnaryOp template. + * + * Usage: + * UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, UnaryMode::Abs) + * UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, UnaryMode::Log) + * + * @param OP The operator name (e.g., abs, log, sin) + * @param NAMESPACE The device namespace (e.g., cpu, nvidia) + * @param MODE The UnaryMode enum value for this operation + */ +#define UNARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::NAMESPACE { \ + using Op = op::elementwise::unary::UnaryOp; \ + } + +} // namespace op::elementwise::unary + +#endif // __INFINIOP_ELEMENTWISE_UNARY_H__ diff --git a/src/infiniop/operator_impl.h b/src/infiniop/operator_impl.h new file mode 100644 index 000000000..3ff543f7e --- /dev/null +++ b/src/infiniop/operator_impl.h @@ -0,0 +1,288 @@ +#ifndef __INFINIOP_OPERATOR_IMPL_H__ +#define __INFINIOP_OPERATOR_IMPL_H__ + +#include "handle.h" +#include "operator.h" + +// Conditional compilation helpers +#ifdef ENABLE_CPU_API +#define IF_ENABLE_CPU_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CPU_API(...) +#endif + +#ifdef ENABLE_NVIDIA_API +#define IF_ENABLE_NVIDIA_API(...) 
__VA_ARGS__ +#else +#define IF_ENABLE_NVIDIA_API(...) +#endif + +#ifdef ENABLE_ILUVATAR_API +#define IF_ENABLE_ILUVATAR_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_ILUVATAR_API(...) +#endif + +#ifdef ENABLE_QY_API +#define IF_ENABLE_QY_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_QY_API(...) +#endif + +#ifdef ENABLE_METAX_API +#define IF_ENABLE_METAX_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_METAX_API(...) +#endif + +#ifdef ENABLE_KUNLUN_API +#define IF_ENABLE_KUNLUN_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_KUNLUN_API(...) +#endif + +#ifdef ENABLE_CAMBRICON_API +#define IF_ENABLE_CAMBRICON_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CAMBRICON_API(...) +#endif + +#ifdef ENABLE_MOORE_API +#define IF_ENABLE_MOORE_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_MOORE_API(...) +#endif + +/** + * Binary operator implementation macros + */ +#define BINARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, c_desc, a_desc, b_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc});) + +#define BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, METAX, metax, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, c_desc, a_desc, b_desc) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, c, a, b) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream);) + +#define BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, c, a, b) + +#define BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete 
reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc) { \ + switch (handle->device) { \ + BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +/** + * Unary operator implementation macros + */ +#define UNARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, y_desc, x_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc});) + +#define UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, METAX, metax, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, y_desc, x_desc) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, 
kunlun) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, y, x) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream);) + +#define UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, y, x) + +#define UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc) { \ + switch (handle->device) { \ + UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +#endif // __INFINIOP_OPERATOR_IMPL_H__ diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc index 7d6e81d04..d4b541ba7 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.cc +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -1,48 +1,8 @@ #include "abs_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::abs::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - 
infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::cpu diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h index 5b9773298..cba8274e6 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.h +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -1,26 +1,9 @@ #ifndef __ABS_CPU_H__ #define __ABS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(abs, cpu) - -namespace op::abs::cpu { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; -} // namespace op::abs::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, op::elementwise::unary::UnaryMode::Abs) #endif // __ABS_CPU_H__ diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh index d7ff2db12..406aa423f 100644 --- a/src/infiniop/ops/abs/cuda/kernel.cuh +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -1,26 +1,10 @@ #ifndef __ABS_CUDA_H__ #define __ABS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::abs::cuda { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __habs2(x); - } else if constexpr (std::is_same_v) { - return __habs(x); - } else if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::abs::cuda #endif // __ABS_CUDA_H__ diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu index 485f0406a..b9687226a 100644 --- a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "abs_nvidia.cuh" namespace op::abs::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = 
x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::nvidia diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc index b6820079d..051b8711a 100644 --- a/src/infiniop/ops/abs/operator.cc +++ b/src/infiniop/ops/abs/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/abs.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/abs_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAbsDescriptor( - infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::abs::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAbs( - infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { - 
-#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(abs, Abs) diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc index 1accb6752..9be4ca1fe 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.cc +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -1,48 +1,8 @@ #include "acos_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acos::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::cpu diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h index 14e74b75c..50900e217 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.h +++ b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOS_CPU_H__ #define __ACOS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acos, cpu) - -namespace op::acos::cpu { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acos(x); - } -} AcosOp; -} // namespace op::acos::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acos, cpu, op::elementwise::unary::UnaryMode::Acos) #endif // __ACOS_CPU_H__ diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh index c3281c7e3..b62bf1e88 100644 --- a/src/infiniop/ops/acos/cuda/kernel.cuh +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOS_CUDA_H__ #define __ACOS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acos::cuda { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acosf(__half2float(x))); - } else if 
constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acosf(x0), acosf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acosf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acosf(x); - } else { - return std::acos(x); - } - } -} AcosOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acos::cuda #endif // __ACOS_CUDA_H__ diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu index 8480219bc..e7cf1feea 100644 --- a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acos_nvidia.cuh" namespace op::acos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::nvidia diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc index e775a005a..15872b754 100644 --- a/src/infiniop/ops/acos/operator.cc +++ b/src/infiniop/ops/acos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcosDescriptor( - infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t 
infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcos( - infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acos, Acos) diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc index 005463679..0cb424c00 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -1,48 +1,8 @@ #include "acosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::cpu diff --git 
a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h index b4b710ed5..bb05baf14 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.h +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOSH_CPU_H__ #define __ACOSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acosh, cpu) - -namespace op::acosh::cpu { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acosh(x); - } -} AcoshOp; -} // namespace op::acosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acosh, cpu, op::elementwise::unary::UnaryMode::Acosh) #endif // __ACOSH_CPU_H__ diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh index fe444b1b4..9fbb54636 100644 --- a/src/infiniop/ops/acosh/cuda/kernel.cuh +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOSH_CUDA_H__ #define __ACOSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acosh::cuda { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acoshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acoshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acoshf(x); - } else { - return std::acosh(x); - } - } -} AcoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acosh::cuda #endif // __ACOSH_CUDA_H__ diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu index fc06590a7..5d065bdbc 100644 --- a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acosh_nvidia.cuh" namespace op::acosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcoshOp, 
half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::nvidia diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc index 9bba3389a..c1939a54c 100644 --- a/src/infiniop/ops/acosh/operator.cc +++ b/src/infiniop/ops/acosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcoshDescriptor( - infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcosh( - infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acosh, Acosh) diff --git 
a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc index e149044f1..de42639ff 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.cc +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -1,48 +1,8 @@ #include "asin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asin::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::cpu diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h index 22bcba337..8c6da5e20 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.h +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASIN_CPU_H__ #define __ASIN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asin, cpu) - -namespace op::asin::cpu { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asin(x); - } -} AsinOp; -} // namespace op::asin::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asin, cpu, op::elementwise::unary::UnaryMode::Asin) #endif // __ASIN_CPU_H__ diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh index 3e8d11a07..a7063f015 100644 --- a/src/infiniop/ops/asin/cuda/kernel.cuh +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASIN_CUDA_H__ #define __ASIN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asin::cuda { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinf(x0), asinf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinf(x); - } else { - return std::asin(x); - } - } -} AsinOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asin::cuda #endif // __ASIN_CUDA_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu 
b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu index 714d2b1b3..262755d50 100644 --- a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asin_nvidia.cuh" namespace op::asin::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::nvidia diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc index c4973e9f5..edb8fa867 100644 --- a/src/infiniop/ops/asin/operator.cc +++ b/src/infiniop/ops/asin/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/asin.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asin_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinDescriptor( - infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asin::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - 
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsin( - infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asin, Asin) diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc index e0d5b749a..8b18ab6f8 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -1,48 +1,8 @@ #include "asinh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::cpu diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h index 0a999b63b..4c3603752 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.h +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASINH_CPU_H__ #define __ASINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asinh, cpu) - -namespace op::asinh::cpu { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asinh(x); - } -} AsinhOp; -} 
// namespace op::asinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asinh, cpu, op::elementwise::unary::UnaryMode::Asinh) #endif // __ASINH_CPU_H__ diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh index 7cb018c8a..866ea147a 100644 --- a/src/infiniop/ops/asinh/cuda/kernel.cuh +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASINH_CUDA_H__ #define __ASINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asinh::cuda { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinhf(x); - } else { - return std::asinh(x); - } - } -} AsinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asinh::cuda #endif // __ASINH_CUDA_H__ diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu index 203008b81..37c44baf0 100644 --- a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asinh_nvidia.cuh" namespace op::asinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::nvidia diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc index d9ff5beda..7b519ec05 100644 --- a/src/infiniop/ops/asinh/operator.cc +++ b/src/infiniop/ops/asinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include 
"infiniop/ops/asinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinhDescriptor( - infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsinh( - infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asinh, Asinh) diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc index a8c613d1e..075c7fd4e 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.cc +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -1,48 +1,8 @@ #include "atan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atan::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); 
- - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::cpu diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h index ac2a1bc0c..6b333cfb1 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.h +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATAN_CPU_H__ #define __ATAN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atan, cpu) - -namespace op::atan::cpu { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atan(x); - } -} AtanOp; -} // namespace op::atan::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atan, cpu, op::elementwise::unary::UnaryMode::Atan) #endif // __ATAN_CPU_H__ diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh index 0c7745196..ce553c1c1 100644 --- a/src/infiniop/ops/atan/cuda/kernel.cuh +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATAN_CUDA_H__ #define __ATAN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atan::cuda { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanf(__half2float(__low2half(x))), atanf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanf(x0), atanf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanf(x); - } else { - return std::atan(x); - } - } -} AtanOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atan::cuda #endif // __ATAN_CUDA_H__ diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu index 2c6cf53d4..a05d65b79 100644 --- a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atan_nvidia.cuh" namespace op::atan::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector 
input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::nvidia diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc index c56e101d2..9025489c3 100644 --- a/src/infiniop/ops/atan/operator.cc +++ b/src/infiniop/ops/atan/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atan.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atan_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanDescriptor( - infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atan::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtan( - infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef 
ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atan, Atan) diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc index 66ef4b1df..d19c978e4 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -1,48 +1,8 @@ #include "atanh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atanh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::cpu diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h index 8c2b04755..1a37453f0 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.h +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATANH_CPU_H__ #define __ATANH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atanh, cpu) - -namespace op::atanh::cpu { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atanh(x); - } -} AtanhOp; -} // namespace op::atanh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atanh, cpu, op::elementwise::unary::UnaryMode::Atanh) #endif // __ATANH_CPU_H__ diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh index 5337d8243..de0866ba5 100644 --- a/src/infiniop/ops/atanh/cuda/kernel.cuh +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATANH_CUDA_H__ #define __ATANH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atanh::cuda { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ 
__forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanhf(x); - } else { - return std::atanh(x); - } - } -} AtanhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atanh::cuda #endif // __ATANH_CUDA_H__ diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu index cb5a1ff03..55b435920 100644 --- a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atanh_nvidia.cuh" namespace op::atanh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::nvidia diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc index a73adcb23..cc9d6131e 100644 --- a/src/infiniop/ops/atanh/operator.cc +++ b/src/infiniop/ops/atanh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atanh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atanh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanhDescriptor( - infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atanh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - 
CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtanh( - infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atanh, Atanh) diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc index 17b3ec888..81ca2fe7a 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -1,48 +1,8 @@ #include "ceil_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::ceil::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::cpu diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h index c3ca8e441..423c784cc 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.h +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -1,26 +1,9 @@ #ifndef __CEIL_CPU_H__ #define __CEIL_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(ceil, cpu) - -namespace op::ceil::cpu { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; -} // namespace op::ceil::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(ceil, cpu, op::elementwise::unary::UnaryMode::Ceil) #endif // __CEIL_CPU_H__ diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh index a2d2e7fb5..1d30a42eb 100644 --- a/src/infiniop/ops/ceil/cuda/kernel.cuh +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __CEIL_CUDA_H__ #define __CEIL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::ceil::cuda { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2ceil(x); - } else if constexpr (std::is_same_v) { - return hceil(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(ceilf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return ceilf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::ceil::cuda #endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu index c7ad2ee5b..88ee35be8 100644 --- a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "ceil_nvidia.cuh" namespace op::ceil::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void 
*workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc index 4e5ee7800..dbe591043 100644 --- a/src/infiniop/ops/ceil/operator.cc +++ b/src/infiniop/ops/ceil/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/ceil.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/ceil_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCeilDescriptor( - infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::ceil::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCeil( - infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef 
ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(ceil, Ceil) diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc index 9dc68d327..19ef002cf 100644 --- a/src/infiniop/ops/cos/cpu/cos_cpu.cc +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -1,48 +1,8 @@ #include "cos_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::cos::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(cos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h index 9b4236fc2..d62aa91b8 100644 --- a/src/infiniop/ops/cos/cpu/cos_cpu.h +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -1,22 +1,9 @@ #ifndef __COS_CPU_H__ #define __COS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(cos, cpu) - -namespace op::cos::cpu { -typedef struct CosOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::cos(x); - } -} CosOp; -} // namespace op::cos::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(cos, cpu, op::elementwise::unary::UnaryMode::Cos) #endif // __COS_CPU_H__ diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh index b0dabb340..57fe4f50e 100644 --- a/src/infiniop/ops/cos/cuda/kernel.cuh +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __COS_CUDA_H__ #define __COS_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::cos::cuda { -typedef struct CosOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2cos(x); - } else if constexpr (std::is_same_v) { - return hcos(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(cosf(x0), cosf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(cosf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __cosf(x); - } else { - return std::cos(x); - } - } -} CosOp; +using Op 
= op::elementwise::unary::cuda::UnaryOp; } // namespace op::cos::cuda #endif // __COS_CUDA_H__ diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu index 044c59ca0..5da3c02e8 100644 --- a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cos_nvidia.cuh" namespace op::cos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 5c464ad60..1531c6caa 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCosDescriptor( - infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef 
ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCos( - infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cos, Cos) diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc index 9ed8e33da..e7b2a6dad 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -1,48 +1,8 @@ #include "cosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::cosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h index aea359ef2..c789d38ea 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.h +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __COSH_CPU_H__ #define __COSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(cosh, cpu) - -namespace op::cosh::cpu { -typedef struct CoshOp { -public: - static constexpr size_t 
num_inputs = 1; - - template - T operator()(const T &x) const { - return std::cosh(x); - } -} CoshOp; -} // namespace op::cosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(cosh, cpu, op::elementwise::unary::UnaryMode::Cosh) #endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh index ce6806433..934bfe12d 100644 --- a/src/infiniop/ops/cosh/cuda/kernel.cuh +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __COSH_CUDA_H__ #define __COSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::cosh::cuda { -typedef struct CoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(coshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(coshf(x0), coshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(coshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return coshf(x); - } else { - return std::cosh(x); - } - } -} CoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::cosh::cuda #endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu index a5e1442ce..038b0373e 100644 --- a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cosh_nvidia.cuh" namespace op::cosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc index 75aac0c91..9b18b47ee 100644 --- a/src/infiniop/ops/cosh/operator.cc +++ b/src/infiniop/ops/cosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" 
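
Each operator.cc in this patch shrinks to a single UNARY_OP_IMPL(op, Name) (or BINARY_OP_IMPL) invocation from the new "../../operator_impl.h" header, which is not itself part of the diff. Below is a minimal sketch of what such a macro presumably expands to, mirroring the hand-written entry points it replaces and omitting the per-backend #ifdef guards for brevity; the macro body shown here is an assumption, not the real operator_impl.h.

// Hypothetical sketch only, NOT the real operator_impl.h: UNARY_OP_IMPL(op, Name)
// presumably stamps out the C entry points that each unary operator.cc used to
// write by hand (per-backend #ifdef guards omitted here for brevity).
#define UNARY_OP_IMPL(OP_NS, OP_NAME)                                         \
    __C infiniStatus_t infiniopCreate##OP_NAME##Descriptor(                   \
        infiniopHandle_t handle,                                              \
        infiniop##OP_NAME##Descriptor_t *desc_ptr,                            \
        infiniopTensorDescriptor_t y_desc,                                    \
        infiniopTensorDescriptor_t x_desc) {                                  \
        switch (handle->device) {                                             \
        case INFINI_DEVICE_CPU:                                               \
            return op::OP_NS::cpu::Descriptor::create(                        \
                handle,                                                       \
                reinterpret_cast<op::OP_NS::cpu::Descriptor **>(desc_ptr),    \
                y_desc, {x_desc});                                            \
        case INFINI_DEVICE_NVIDIA:                                            \
            return op::OP_NS::nvidia::Descriptor::create(                     \
                handle,                                                       \
                reinterpret_cast<op::OP_NS::nvidia::Descriptor **>(desc_ptr), \
                y_desc, {x_desc});                                            \
        default:                                                              \
            return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;                   \
        }                                                                     \
    }
// The real macro would also generate infiniopGet<Name>WorkspaceSize, infiniop<Name>
// and infiniopDestroy<Name>Descriptor in the same fashion, so UNARY_OP_IMPL(cos, Cos)
// recreates exactly the four functions whose hand-written versions this patch removes.
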
-#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCoshDescriptor( - infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCosh( - infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cosh, Cosh) diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc index 19e222031..6d150070c 100644 --- a/src/infiniop/ops/div/cpu/div_cpu.cc +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -1,50 +1,8 @@ #include "div_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::div::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(div) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = 
reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h index 0373b766f..ad76e7ef1 100644 --- a/src/infiniop/ops/div/cpu/div_cpu.h +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -1,19 +1,9 @@ #ifndef __DIV_CPU_H__ #define __DIV_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(div, cpu) - -namespace op::div::cpu { -typedef struct DivOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return a / b; - } -} DivOp; -} // namespace op::div::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, op::elementwise::binary::BinaryMode::Divide) #endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh index a67993da5..f1ab13152 100644 --- a/src/infiniop/ops/div/cuda/kernel.cuh +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __DIV_CUDA_H__ #define __DIV_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::div::cuda { -typedef struct DivOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __h2div(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a / b; - } else if constexpr (std::is_same_v) { - return __fdividef(a, b); - } else { - return a / b; - } - } -} DivOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::div::cuda #endif // __DIV_CUDA_H__ diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu index 1abffe816..8aaba09b4 100644 --- a/src/infiniop/ops/div/nvidia/div_nvidia.cu +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "div_nvidia.cuh" namespace op::div::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(div) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const 
auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc index 84021a1af..af9d1929a 100644 --- a/src/infiniop/ops/div/operator.cc +++ b/src/infiniop/ops/div/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/div.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/div_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/div_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/div_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/div_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/div_moore.h" -#endif - -__C infiniStatus_t infiniopCreateDivDescriptor( - infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::div::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef 
ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopDiv( - infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(div, Div) diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc index 00b1897d1..d9119c697 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.cc +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -1,48 +1,8 @@ #include "erf_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::erf::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - 
return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h index c26f519cf..f50cd157d 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.h +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -1,22 +1,9 @@ #ifndef __ERF_CPU_H__ #define __ERF_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(erf, cpu) - -namespace op::erf::cpu { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::erf(x); - } -} ErfOp; -} // namespace op::erf::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(erf, cpu, op::elementwise::unary::UnaryMode::Erf) #endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh index 820c10b19..978890cff 100644 --- a/src/infiniop/ops/erf/cuda/kernel.cuh +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ERF_CUDA_H__ #define __ERF_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::erf::cuda { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(erff(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(erff(x0), erff(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(erff(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return erff(x); - } else { - return std::erf(x); - } - } -} ErfOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::erf::cuda #endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu index 9080593de..0d743b538 100644 --- a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "erf_nvidia.cuh" namespace op::erf::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, 
cuda::ErfOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc index 1491cfa9a..9304cf525 100644 --- a/src/infiniop/ops/erf/operator.cc +++ b/src/infiniop/ops/erf/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/erf.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/erf_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateErfDescriptor( - infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::erf::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopErf( - infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(erf, Erf) diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc 
b/src/infiniop/ops/floor/cpu/floor_cpu.cc index e809a02e2..cc717ac11 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.cc +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -1,48 +1,8 @@ #include "floor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::floor::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h index 91508a384..a246309e8 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.h +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -1,26 +1,9 @@ #ifndef __FLOOR_CPU_H__ #define __FLOOR_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(floor, cpu) - -namespace op::floor::cpu { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; -} // namespace op::floor::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(floor, cpu, op::elementwise::unary::UnaryMode::Floor) #endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh index c89ce34f4..23a7a44e9 100644 --- a/src/infiniop/ops/floor/cuda/kernel.cuh +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __FLOOR_CUDA_H__ #define __FLOOR_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::floor::cuda { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2floor(x); - } else if constexpr (std::is_same_v) { - return hfloor(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(floorf(x0), floorf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(floorf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return floorf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::floor::cuda #endif // 
__FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu index 08305048a..cec304a1c 100644 --- a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "floor_nvidia.cuh" namespace op::floor::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc index 4e4ed2b5a..64e4a586b 100644 --- a/src/infiniop/ops/floor/operator.cc +++ b/src/infiniop/ops/floor/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/floor.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/floor_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateFloorDescriptor( - infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::floor::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, 
nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopFloor( - infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(floor, Floor) diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc index e7314c319..734ad1617 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.cc +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -1,48 +1,8 @@ #include "log_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::log::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(log) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h index 535e681d3..b13d01442 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.h +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -1,22 +1,9 @@ #ifndef __LOG_CPU_H__ #define __LOG_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(log, cpu) - -namespace op::log::cpu { -typedef struct LogOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T 
&x) const { - return std::log(x); - } -} LogOp; -} // namespace op::log::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, op::elementwise::unary::UnaryMode::Log) #endif // __LOG_CPU_H__ diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh index b1e46873c..80980ada1 100644 --- a/src/infiniop/ops/log/cuda/kernel.cuh +++ b/src/infiniop/ops/log/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __LOG_CUDA_H__ #define __LOG_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::log::cuda { -typedef struct LogOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2log(x); - } else if constexpr (std::is_same_v) { - return __float2half(__logf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(logf(x0), logf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(logf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __logf(x); - } else { - return std::log(x); - } - } -} LogOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::log::cuda #endif // __LOG_CUDA_H__ diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu index 9e7bcafc4..87aaa0388 100644 --- a/src/infiniop/ops/log/nvidia/log_nvidia.cu +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "log_nvidia.cuh" namespace op::log::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(log) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::log::nvidia diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc index 8f2add408..9614a0861 100644 --- a/src/infiniop/ops/log/operator.cc +++ b/src/infiniop/ops/log/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/log.h" #ifdef ENABLE_CPU_API @@ -9,131 
+8,4 @@ #include "nvidia/log_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateLogDescriptor( - infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::log::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopLog( - infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(log, Log) diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc index 1b30fa4e4..98e8a52a2 100644 --- a/src/infiniop/ops/max/cpu/max_cpu.cc +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -1,50 +1,8 @@ #include "max_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::max::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = 
input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::max::cpu diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h index 4d085ed39..2219994d5 100644 --- a/src/infiniop/ops/max/cpu/max_cpu.h +++ b/src/infiniop/ops/max/cpu/max_cpu.h @@ -1,20 +1,9 @@ #ifndef __MAX_CPU_H__ #define __MAX_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -#include -ELEMENTWISE_DESCRIPTOR(max, cpu) - -namespace op::max::cpu { -typedef struct MaxOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::max(a, b); - } -} MaxOp; -} // namespace op::max::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(max, cpu, op::elementwise::binary::BinaryMode::Max) #endif // __MAX_CPU_H__ diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh index bf3977a31..68f634559 100644 --- a/src/infiniop/ops/max/cuda/kernel.cuh +++ b/src/infiniop/ops/max/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __MAX_CUDA_H__ #define __MAX_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::max::cuda { -typedef struct MaxOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __hmax2(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a > b ? a : b; - } else if constexpr (std::is_same_v) { - return fmaxf(a, b); - } else { - return a > b ? 
a : b; - } - } -} MaxOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::max::cuda #endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu index 5e9fb13f4..ba4620f3b 100644 --- a/src/infiniop/ops/max/nvidia/max_nvidia.cu +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "max_nvidia.cuh" namespace op::max::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc index e04368533..3e5299f52 100644 --- a/src/infiniop/ops/max/operator.cc +++ b/src/infiniop/ops/max/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/max.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/max_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/max_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/max_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/max_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/max_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMaxDescriptor( - infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::max::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMax( - infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(max, Max) diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc index dc30ee57f..1bac9ea61 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.cc +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -1,50 +1,8 @@ #include "min_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::min::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h index 1c84d4fca..74042db50 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.h +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -1,20 +1,9 @@ #ifndef __MIN_CPU_H__ #define __MIN_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -#include -ELEMENTWISE_DESCRIPTOR(min, cpu) - -namespace op::min::cpu { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::min(a, b); - } -} MinOp; -} // namespace op::min::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(min, cpu, op::elementwise::binary::BinaryMode::Min) #endif // __MIN_CPU_H__ diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh index aac14a0e8..75c6ab6b9 100644 --- a/src/infiniop/ops/min/cuda/kernel.cuh +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __MIN_CUDA_H__ #define __MIN_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::min::cuda { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __hmin2(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a < b ? a : b; - } else if constexpr (std::is_same_v) { - return fminf(a, b); - } else { - return a < b ? 
a : b; - } - } -} MinOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::min::cuda #endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu index 419655e29..0708cbcaf 100644 --- a/src/infiniop/ops/min/nvidia/min_nvidia.cu +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "min_nvidia.cuh" namespace op::min::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc index 8479feab4..6f67ecf87 100644 --- a/src/infiniop/ops/min/operator.cc +++ b/src/infiniop/ops/min/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/min.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/min_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/min_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/min_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/min_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/min_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMinDescriptor( - infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::min::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMin( - infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(min, Min) diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc index 907d05166..609c2e76e 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.cc +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -1,49 +1,8 @@ #include "mod_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::mod::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h index 9e78adca6..72ea7dede 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.h +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -1,23 +1,9 @@ #ifndef __MOD_CPU_H__ #define __MOD_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(mod, cpu) - -namespace op::mod::cpu { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; -} // namespace op::mod::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(mod, cpu, op::elementwise::binary::BinaryMode::Mod) #endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh index 0dcb54136..164784081 100644 --- a/src/infiniop/ops/mod/cuda/kernel.cuh +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -1,30 +1,10 @@ #ifndef __MOD_CUDA_H__ #define __MOD_CUDA_H__ -#include -#include +#include "../../../elementwise/binary.h" namespace op::mod::cuda { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - return __float2half(std::fmod(a_, b_)); - } else if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::mod::cuda #endif // __MOD_CUDA_H__ diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu index 64326d441..68b78ee70 100644 --- a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" 
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "mod_nvidia.cuh" namespace op::mod::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc index 85810e794..aef892ce1 100644 --- a/src/infiniop/ops/mod/operator.cc +++ b/src/infiniop/ops/mod/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/mod.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/mod_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateModDescriptor( - infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::mod::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMod( - infiniopModDescriptor_t desc, - void 
*workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(mod, Mod) diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc index 5da2ae4c3..47f4d2b2e 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.cc +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -1,48 +1,8 @@ #include "neg_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::neg::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::cpu diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h index ea45989b3..f6778a6d3 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.h +++ b/src/infiniop/ops/neg/cpu/neg_cpu.h @@ -2,19 +2,8 @@ #define __NEG_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(neg, cpu) - -namespace op::neg::cpu { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return -x; - } -} NegOp; -} // namespace op::neg::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(neg, cpu, op::elementwise::unary::UnaryMode::Neg) #endif // __NEG_CPU_H__ diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh 
b/src/infiniop/ops/neg/cuda/kernel.cuh index 57904b3df..f5cf5a449 100644 --- a/src/infiniop/ops/neg/cuda/kernel.cuh +++ b/src/infiniop/ops/neg/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __NEG_CUDA_H__ #define __NEG_CUDA_H__ -#include +#include "../../../elementwise/unary.h" namespace op::neg::cuda { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __hneg2(x); - } else if constexpr (std::is_same_v) { - return __hneg(x); - } else { - return -x; - } - } -} NegOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::neg::cuda #endif // __NEG_CUDA_H__ diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu index d18b8bf25..f568585f0 100644 --- a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu +++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "neg_nvidia.cuh" namespace op::neg::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::nvidia diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc index d4134df3e..c3945f4bb 100644 --- a/src/infiniop/ops/neg/operator.cc +++ b/src/infiniop/ops/neg/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/neg.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/neg_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateNegDescriptor( - infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::neg::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - 
CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopNeg( - infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(neg, Neg) diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc index 0c6fda0f7..1134d8aae 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.cc +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -1,49 +1,8 @@ #include "pow_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::pow::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: 
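// ---------------------------------------------------------------------------
// [Editorial aside - not part of the patch] The BINARY_OP_IMPL / UNARY_OP_IMPL
// macros used below regenerate the same four public C entry points whose
// hand-written bodies this patch deletes, so callers are unaffected. A minimal
// usage sketch for pow follows; run_pow_example is a hypothetical helper, and
// the handle, tensor descriptors, device buffers, workspace and stream are
// assumed to have been created elsewhere.
#include "infiniop/ops/pow.h"
#include <cstddef>

infiniStatus_t run_pow_example(infiniopHandle_t handle,
                               infiniopTensorDescriptor_t c_desc,
                               infiniopTensorDescriptor_t a_desc,
                               infiniopTensorDescriptor_t b_desc,
                               void *c, const void *a, const void *b,
                               void *workspace, size_t workspace_capacity,
                               void *stream) {
    infiniopPowDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreatePowDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetPowWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS && workspace_size > workspace_capacity) {
        // the caller must supply a workspace at least as large as the queried size
        status = INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    if (status == INFINI_STATUS_SUCCESS) {
        // c = a ** b, elementwise, on the device the handle was created for
        status = infiniopPow(desc, workspace, workspace_size, c, a, b, stream);
    }

    infiniopDestroyPowDescriptor(desc);
    return status;
}
// ---------------------------------------------------------------------------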
- return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h index 21d9bb897..9c8e8a368 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.h +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -1,19 +1,9 @@ #ifndef __POW_CPU_H__ #define __POW_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(pow, cpu) - -namespace op::pow::cpu { -typedef struct PowOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::pow(a, b); - } -} PowOp; -} // namespace op::pow::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, op::elementwise::binary::BinaryMode::Pow) #endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index 3786e7a52..0637240e8 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -1,40 +1,10 @@ #ifndef __POW_CUDA_H__ #define __POW_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/binary.h" namespace op::pow::cuda { -typedef struct PowOp { - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - float ans_f = __powf(a_, b_); - return __float2half(isnan(ans_f) ? 
std::pow(a_, b_) : ans_f); - } else if constexpr (std::is_same_v) { - float2 a_f2 = __bfloat1622float2(a); - float2 b_f2 = __bfloat1622float2(b); - return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); - } else if constexpr (std::is_same_v) { - float a_ = __bfloat162float(a); - float b_ = __bfloat162float(b); - return __float2bfloat16_rn(__powf(a_, b_)); - } else if constexpr (std::is_same_v) { - return __powf(a, b); - } else { - return std::pow(a, b); - } - } -} PowOp; - +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::pow::cuda #endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu index 3cfd0cd2f..63a3d40a3 100644 --- a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "pow_nvidia.cuh" namespace op::pow::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::nvidia diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc index e90639f67..b1ddbc9c1 100644 --- a/src/infiniop/ops/pow/operator.cc +++ b/src/infiniop/ops/pow/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/pow.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/pow_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreatePowDescriptor( - infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::pow::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - 
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopPow( - infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(pow, Pow) diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc index 52874c8b3..0b66eca64 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc @@ -1,48 +1,8 @@ #include "reciprocal_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::reciprocal::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::cpu diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h index 0a0f223f0..9af583ab7 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h @@ -2,19 +2,8 @@ #define __RECIPROCAL_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(reciprocal, cpu) - -namespace op::reciprocal::cpu { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return T(1) / x; - } -} ReciprocalOp; -} // namespace op::reciprocal::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(reciprocal, cpu, op::elementwise::unary::UnaryMode::Reciprocal) #endif // __RECIPROCAL_CPU_H__ diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh index 94c71de90..8c29a8e9e 100644 --- a/src/infiniop/ops/reciprocal/cuda/kernel.cuh +++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __RECIPROCAL_CUDA_H__ #define __RECIPROCAL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::reciprocal::cuda { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2rcp(x); - } else if constexpr (std::is_same_v) { - return hrcp(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __frcp_rn(x); - } else { - return T(1) / x; - } - } -} ReciprocalOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::reciprocal::cuda #endif // __RECIPROCAL_CUDA_H__ diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu index 45b74e25e..39a41b583 100644 --- a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "reciprocal_nvidia.cuh" namespace op::reciprocal::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, 
input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc index 033286024..966bd72d8 100644 --- a/src/infiniop/ops/reciprocal/operator.cc +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/reciprocal.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/reciprocal_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateReciprocalDescriptor( - infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::reciprocal::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopReciprocal( - infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return 
INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(reciprocal, Reciprocal) diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc index 0b0cea7b7..20ae304bd 100644 --- a/src/infiniop/ops/round/cpu/round_cpu.cc +++ b/src/infiniop/ops/round/cpu/round_cpu.cc @@ -1,48 +1,8 @@ #include "round_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::round::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(round) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::round::cpu diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h index eccd6df0f..1a755dbf8 100644 --- a/src/infiniop/ops/round/cpu/round_cpu.h +++ b/src/infiniop/ops/round/cpu/round_cpu.h @@ -2,24 +2,8 @@ #define __ROUND_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" -#include +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(round, cpu) - -namespace op::round::cpu { -typedef struct RoundOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::nearbyint(x); - } - } -} RoundOp; -} // namespace op::round::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(round, cpu, op::elementwise::unary::UnaryMode::Round) #endif // __ROUND_CPU_H__ diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh index c52a10716..f4de9c772 100644 --- a/src/infiniop/ops/round/cuda/kernel.cuh +++ b/src/infiniop/ops/round/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __ROUND_CUDA_H__ #define __ROUND_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::round::cuda { -typedef struct RoundOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2rint(x); - } else if constexpr (std::is_same_v) { - return hrint(x); - } else if constexpr (std::is_same_v) { - float x0 = 
__bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(rintf(x0), rintf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(rintf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return rintf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::nearbyint(x); - } - } -} RoundOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::round::cuda #endif // __ROUND_CUDA_H__ diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu index c1fabc885..dc84388a3 100644 --- a/src/infiniop/ops/round/nvidia/round_nvidia.cu +++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "round_nvidia.cuh" namespace op::round::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(round) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::round::nvidia diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc index 9468803c8..a20fbcb17 100644 --- a/src/infiniop/ops/round/operator.cc +++ b/src/infiniop/ops/round/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/round.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/round_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateRoundDescriptor( - infiniopHandle_t handle, - infiniopRoundDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::round::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C 
infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopRound( - infiniopRoundDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(round, Round) diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc index 1f3430e73..c65868d09 100644 --- a/src/infiniop/ops/sign/cpu/sign_cpu.cc +++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc @@ -1,48 +1,8 @@ #include "sign_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sign::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sign) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sign::cpu diff --git 
a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h index 505194c85..7ddeec543 100644 --- a/src/infiniop/ops/sign/cpu/sign_cpu.h +++ b/src/infiniop/ops/sign/cpu/sign_cpu.h @@ -2,19 +2,8 @@ #define __SIGN_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sign, cpu) - -namespace op::sign::cpu { -typedef struct SignOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); - } -} SignOp; -} // namespace op::sign::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sign, cpu, op::elementwise::unary::UnaryMode::Sign) #endif // __SIGN_CPU_H__ diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh index 3737282b0..a1216fb82 100644 --- a/src/infiniop/ops/sign/cuda/kernel.cuh +++ b/src/infiniop/ops/sign/cuda/kernel.cuh @@ -1,25 +1,10 @@ #ifndef __SIGN_CUDA_H__ #define __SIGN_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::sign::cuda { -typedef struct SignOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); - return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); - } else if constexpr (std::is_same_v) { - return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); - } else { - return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); - } - } -} SignOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sign::cuda #endif // __SIGN_CUDA_H__ diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu index 6a3152e41..2a11f9e23 100644 --- a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu +++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sign_nvidia.cuh" namespace op::sign::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sign) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return 
INFINI_STATUS_SUCCESS; -} } // namespace op::sign::nvidia diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc index 8f658a9b3..1a4599d5d 100644 --- a/src/infiniop/ops/sign/operator.cc +++ b/src/infiniop/ops/sign/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sign.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sign_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSignDescriptor( - infiniopHandle_t handle, - infiniopSignDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sign::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSign( - infiniopSignDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(sign, Sign) diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc index 40685847d..897439905 100644 --- a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc @@ -1,48 +1,8 @@ #include "sinh_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h index dbc8f3c7e..573027ee3 100644 --- a/src/infiniop/ops/sinh/cpu/sinh_cpu.h +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __SINH_CPU_H__ #define __SINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sinh, cpu) - -namespace op::sinh::cpu { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::sinh(x); - } -} SinhOp; -} // namespace op::sinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sinh, cpu, op::elementwise::unary::UnaryMode::Sinh) #endif // __SINH_CPU_H__ diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh index c09150666..d5bb7491f 100644 --- a/src/infiniop/ops/sinh/cuda/kernel.cuh +++ b/src/infiniop/ops/sinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __SINH_CUDA_H__ #define __SINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::sinh::cuda { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(sinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(sinhf(x0), sinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(sinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return sinhf(x); - } else { - return std::sinh(x); - } - } -} SinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sinh::cuda #endif // __SINH_CUDA_H__ diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu index d4c3fd165..3abfc2973 100644 --- a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu @@ -1,54 +1,10 @@ -#include 
"../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sinh_nvidia.cuh" namespace op::sinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::nvidia diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc index 1636ce2c8..41940d235 100644 --- a/src/infiniop/ops/sinh/operator.cc +++ b/src/infiniop/ops/sinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSinhDescriptor( - infiniopHandle_t handle, - infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSinh( - infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - 
-#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(sinh, Sinh) diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc index 99e723126..eb9ac4d66 100644 --- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc @@ -1,48 +1,8 @@ #include "sqrt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sqrt::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sqrt) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sqrt::cpu diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h index 3d026cf63..ed6217e1f 100644 --- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h @@ -1,22 +1,9 @@ #ifndef __SQRT_CPU_H__ #define __SQRT_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sqrt, cpu) - -namespace op::sqrt::cpu { -typedef struct SqrtOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::sqrt(x); - } -} SqrtOp; -} // namespace op::sqrt::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sqrt, cpu, op::elementwise::unary::UnaryMode::Sqrt) #endif // __SQRT_CPU_H__ diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh index 
c82cd7dd5..40ab9708f 100644 --- a/src/infiniop/ops/sqrt/cuda/kernel.cuh +++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __SQRT_CUDA_H__ #define __SQRT_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::sqrt::cuda { -typedef struct SqrtOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2sqrt(x); - } else if constexpr (std::is_same_v) { - return hsqrt(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __fsqrt_rn(x); - } else { - return std::sqrt(x); - } - } -} SqrtOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sqrt::cuda #endif // __SQRT_CUDA_H__ diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu index 519d06e89..4d6c70d72 100644 --- a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sqrt_nvidia.cuh" namespace op::sqrt::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sqrt::nvidia diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc index b11c8a4b5..fe999f58f 100644 --- a/src/infiniop/ops/sqrt/operator.cc +++ b/src/infiniop/ops/sqrt/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sqrt.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sqrt_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSqrtDescriptor( - infiniopHandle_t handle, - infiniopSqrtDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define 
CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sqrt::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSqrt( - infiniopSqrtDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(sqrt, Sqrt) diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc index 2947dfc5e..5166cf64f 100644 --- a/src/infiniop/ops/tan/cpu/tan_cpu.cc +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -1,48 +1,8 @@ #include "tan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::tan::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(tan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, 
out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h index c3a22456c..6c697c311 100644 --- a/src/infiniop/ops/tan/cpu/tan_cpu.h +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -1,22 +1,9 @@ #ifndef __TAN_CPU_H__ #define __TAN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(tan, cpu) - -namespace op::tan::cpu { -typedef struct TanOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::tan(x); - } -} TanOp; -} // namespace op::tan::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(tan, cpu, op::elementwise::unary::UnaryMode::Tan) #endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh index bbd8facaa..c3cf45350 100644 --- a/src/infiniop/ops/tan/cuda/kernel.cuh +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -1,55 +1,10 @@ #ifndef __TAN_CUDA_H__ #define __TAN_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include -#include - -#define TAN_THRESHOLD 15000 +#include "../../../elementwise/unary.h" namespace op::tan::cuda { -typedef struct TanOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2sin(x) / h2cos(x); - } else if constexpr (std::is_same_v) { - float tan_f = __tanf(__half2float(x)); - if (std::fabs(tan_f) > TAN_THRESHOLD) { - return __float2half(tanf(__half2float(x))); - } - return __float2half(tan_f); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - float tan_f0 = __tanf(x0); - float tan_f1 = __tanf(x1); - if (std::fabs(tan_f0) > TAN_THRESHOLD) { - tan_f0 = tanf(x0); - } - if (std::fabs(tan_f1) > TAN_THRESHOLD) { - tan_f1 = tanf(x1); - } - return __floats2bfloat162_rn(tan_f0, tan_f1); - } else if constexpr (std::is_same_v) { - float tan_f = __tanf(__bfloat162float(x)); - if (std::fabs(tan_f) > TAN_THRESHOLD) { - return __float2bfloat16_rn(tanf(__bfloat162float(x))); - } - return __float2bfloat16_rn(tan_f); - } else if constexpr (std::is_same_v) { - float tan_f = __tanf(x); - if (std::fabs(tan_f) > TAN_THRESHOLD) { - return tanf(x); - } - return tan_f; - } else { - return std::tan(x); - } - } -} TanOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::tan::cuda #endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu index b4c24e2fe..5f56dcb6f 100644 --- a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "tan_nvidia.cuh" namespace op::tan::nvidia { -Descriptor::~Descriptor() = default; 
+ELEMENTWISE_NVIDIA_IMPL_UNARY(tan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index 48ae8d48e..ae506dcd8 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/tan.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/tan_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateTanDescriptor( - infiniopHandle_t handle, - infiniopTanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::tan::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopTan( - infiniopTanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef 
ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(tan, Tan) diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh index e336a4995..d987ac7c5 100644 --- a/src/infiniop/ops/tanh/cuda/kernel.cuh +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -1,44 +1,10 @@ #ifndef __TANH_CUDA_H__ #define __TANH_CUDA_H__ -#include +#include "../../../elementwise/unary.h" namespace op::tanh::cuda { -typedef struct TanhOp { - static constexpr size_t num_inputs = 1; - - __device__ __forceinline__ float tanh_f32_func(float x) const { - return tanhf(x); - } - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = tanh_f32_func(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - float r0 = tanh_f32_func(f0); - float r1 = tanh_f32_func(f1); - return __floats2bfloat162_rn(r0, r1); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - float rf = tanh_f32_func(xf); - return __float2bfloat16_rn(rf); - } else if constexpr (std::is_same_v) { - return tanh_f32_func(input); - } else if constexpr (std::is_same_v) { - return std::tanh(input); - } else { - return std::tanh(input); - } - } -} TanhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::tanh::cuda #endif // __TANH_CUDA_H__ diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu index a2c36551c..62f02da67 100644 --- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -1,59 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "tanh_nvidia.cuh" namespace op::tanh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(tanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, 
INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::tanh::nvidia From 2ed76b9f46a0e5ff5436b361cebc94e495f4d98a Mon Sep 17 00:00:00 2001 From: gongchensu Date: Fri, 16 Jan 2026 06:34:19 +0000 Subject: [PATCH 4/7] Issue/887 - Refactor: consolidate binary/unary operator headers and tests into unified files --- include/infiniop.h | 27 +- include/infiniop/ops/abs.h | 8 - include/infiniop/ops/acos.h | 8 - include/infiniop/ops/acosh.h | 8 - include/infiniop/ops/asin.h | 8 - include/infiniop/ops/asinh.h | 8 - include/infiniop/ops/atan.h | 8 - include/infiniop/ops/atanh.h | 8 - include/infiniop/ops/binary_ops_api.h | 23 + include/infiniop/ops/ceil.h | 8 - include/infiniop/ops/cos.h | 8 - include/infiniop/ops/cosh.h | 8 - include/infiniop/ops/div.h | 8 - include/infiniop/ops/erf.h | 8 - include/infiniop/ops/floor.h | 8 - include/infiniop/ops/log.h | 8 - include/infiniop/ops/max.h | 8 - include/infiniop/ops/min.h | 8 - include/infiniop/ops/mod.h | 8 - include/infiniop/ops/neg.h | 8 - include/infiniop/ops/pow.h | 8 - include/infiniop/ops/reciprocal.h | 8 - include/infiniop/ops/round.h | 8 - include/infiniop/ops/sign.h | 8 - include/infiniop/ops/sinh.h | 8 - include/infiniop/ops/sqrt.h | 8 - include/infiniop/ops/tan.h | 8 - include/infiniop/ops/unary_ops_api.h | 39 ++ scripts/test_binary_unary.py | 143 ----- src/infiniop/ops/abs/operator.cc | 2 +- src/infiniop/ops/acos/operator.cc | 2 +- src/infiniop/ops/acosh/operator.cc | 2 +- src/infiniop/ops/asin/operator.cc | 2 +- src/infiniop/ops/asinh/operator.cc | 2 +- src/infiniop/ops/atan/operator.cc | 2 +- src/infiniop/ops/atanh/operator.cc | 2 +- src/infiniop/ops/ceil/operator.cc | 2 +- src/infiniop/ops/cos/operator.cc | 2 +- src/infiniop/ops/cosh/operator.cc | 2 +- src/infiniop/ops/div/operator.cc | 2 +- src/infiniop/ops/erf/operator.cc | 2 +- src/infiniop/ops/floor/operator.cc | 2 +- src/infiniop/ops/log/operator.cc | 2 +- src/infiniop/ops/max/operator.cc | 2 +- src/infiniop/ops/min/operator.cc | 2 +- src/infiniop/ops/mod/operator.cc | 2 +- src/infiniop/ops/neg/operator.cc | 2 +- src/infiniop/ops/pow/operator.cc | 2 +- src/infiniop/ops/reciprocal/operator.cc | 2 +- src/infiniop/ops/round/operator.cc | 2 +- src/infiniop/ops/sign/operator.cc | 2 +- src/infiniop/ops/sinh/operator.cc | 2 +- src/infiniop/ops/sqrt/operator.cc | 2 +- src/infiniop/ops/tan/operator.cc | 2 +- test/infiniop/abs.py | 164 ------ test/infiniop/acos.py | 165 ------ test/infiniop/acosh.py | 165 ------ test/infiniop/asin.py 
| 165 ------ test/infiniop/asinh.py | 165 ------ test/infiniop/atan.py | 164 ------ test/infiniop/atanh.py | 165 ------ test/infiniop/ceil.py | 165 ------ test/infiniop/cos.py | 166 ------ test/infiniop/cosh.py | 165 ------ test/infiniop/div.py | 192 ------ test/infiniop/erf.py | 165 ------ test/infiniop/floor.py | 165 ------ test/infiniop/libinfiniop/binary_test_base.py | 273 +++++++++ test/infiniop/libinfiniop/unary_test_base.py | 242 ++++++++ test/infiniop/log.py | 166 ------ test/infiniop/max.py | 189 ------ test/infiniop/min.py | 189 ------ test/infiniop/mod.py | 190 ------ test/infiniop/neg.py | 165 ------ test/infiniop/pow.py | 190 ------ test/infiniop/reciprocal.py | 168 ------ test/infiniop/round.py | 165 ------ test/infiniop/sign.py | 166 ------ test/infiniop/sinh.py | 166 ------ test/infiniop/sqrt.py | 166 ------ test/infiniop/tan.py | 167 ------ test/infiniop/test_all_binary_ops.py | 251 ++++++++ test/infiniop/test_all_unary_ops.py | 548 ++++++++++++++++++ 83 files changed, 1405 insertions(+), 4649 deletions(-) delete mode 100644 include/infiniop/ops/abs.h delete mode 100644 include/infiniop/ops/acos.h delete mode 100644 include/infiniop/ops/acosh.h delete mode 100644 include/infiniop/ops/asin.h delete mode 100644 include/infiniop/ops/asinh.h delete mode 100644 include/infiniop/ops/atan.h delete mode 100644 include/infiniop/ops/atanh.h create mode 100644 include/infiniop/ops/binary_ops_api.h delete mode 100644 include/infiniop/ops/ceil.h delete mode 100644 include/infiniop/ops/cos.h delete mode 100644 include/infiniop/ops/cosh.h delete mode 100644 include/infiniop/ops/div.h delete mode 100644 include/infiniop/ops/erf.h delete mode 100644 include/infiniop/ops/floor.h delete mode 100644 include/infiniop/ops/log.h delete mode 100644 include/infiniop/ops/max.h delete mode 100644 include/infiniop/ops/min.h delete mode 100644 include/infiniop/ops/mod.h delete mode 100644 include/infiniop/ops/neg.h delete mode 100644 include/infiniop/ops/pow.h delete mode 100644 include/infiniop/ops/reciprocal.h delete mode 100644 include/infiniop/ops/round.h delete mode 100644 include/infiniop/ops/sign.h delete mode 100644 include/infiniop/ops/sinh.h delete mode 100644 include/infiniop/ops/sqrt.h delete mode 100644 include/infiniop/ops/tan.h create mode 100644 include/infiniop/ops/unary_ops_api.h delete mode 100755 scripts/test_binary_unary.py delete mode 100644 test/infiniop/abs.py delete mode 100644 test/infiniop/acos.py delete mode 100644 test/infiniop/acosh.py delete mode 100644 test/infiniop/asin.py delete mode 100644 test/infiniop/asinh.py delete mode 100644 test/infiniop/atan.py delete mode 100644 test/infiniop/atanh.py delete mode 100644 test/infiniop/ceil.py delete mode 100644 test/infiniop/cos.py delete mode 100644 test/infiniop/cosh.py delete mode 100644 test/infiniop/div.py delete mode 100644 test/infiniop/erf.py delete mode 100644 test/infiniop/floor.py create mode 100644 test/infiniop/libinfiniop/binary_test_base.py create mode 100644 test/infiniop/libinfiniop/unary_test_base.py delete mode 100644 test/infiniop/log.py delete mode 100644 test/infiniop/max.py delete mode 100644 test/infiniop/min.py delete mode 100644 test/infiniop/mod.py delete mode 100644 test/infiniop/neg.py delete mode 100644 test/infiniop/pow.py delete mode 100644 test/infiniop/reciprocal.py delete mode 100644 test/infiniop/round.py delete mode 100644 test/infiniop/sign.py delete mode 100644 test/infiniop/sinh.py delete mode 100644 test/infiniop/sqrt.py delete mode 100644 test/infiniop/tan.py create mode 100644 
test/infiniop/test_all_binary_ops.py create mode 100644 test/infiniop/test_all_unary_ops.py diff --git a/include/infiniop.h b/include/infiniop.h index 4778fce90..e87839bc2 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,47 +2,29 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" -#include "infiniop/ops/abs.h" -#include "infiniop/ops/acos.h" -#include "infiniop/ops/acosh.h" +// Unified headers for elementwise operators +#include "infiniop/ops/unary_ops_api.h" +#include "infiniop/ops/binary_ops_api.h" +// Other operators #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" -#include "infiniop/ops/asin.h" -#include "infiniop/ops/asinh.h" -#include "infiniop/ops/atan.h" -#include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" -#include "infiniop/ops/ceil.h" -#include "infiniop/ops/cos.h" -#include "infiniop/ops/cosh.h" -#include "infiniop/ops/erf.h" -#include "infiniop/ops/floor.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" -#include "infiniop/ops/div.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" -#include "infiniop/ops/log.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" -#include "infiniop/ops/max.h" -#include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" -#include "infiniop/ops/neg.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/random_sample.h" -#include "infiniop/ops/reciprocal.h" #include "infiniop/ops/rearrange.h" -#include "infiniop/ops/round.h" -#include "infiniop/ops/sign.h" -#include "infiniop/ops/sinh.h" -#include "infiniop/ops/sqrt.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" @@ -52,7 +34,6 @@ #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" -#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h deleted file mode 100644 index 1d1f1cbd1..000000000 --- a/include/infiniop/ops/abs.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ABS_API_H__ -#define __INFINIOP_ABS_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(abs, Abs) - -#endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h deleted file mode 100644 index c2f4de837..000000000 --- a/include/infiniop/ops/acos.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ACOS_API_H__ -#define __INFINIOP_ACOS_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(acos, Acos) - -#endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h deleted file mode 100644 index e8630b7d5..000000000 --- a/include/infiniop/ops/acosh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ACOSH_API_H__ -#define __INFINIOP_ACOSH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(acosh, Acosh) - -#endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h deleted file mode 100644 index 1a8bdd7b8..000000000 --- a/include/infiniop/ops/asin.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ASIN_API_H__ -#define __INFINIOP_ASIN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(asin, Asin) - -#endif diff --git a/include/infiniop/ops/asinh.h 
b/include/infiniop/ops/asinh.h deleted file mode 100644 index 2a3aebf5a..000000000 --- a/include/infiniop/ops/asinh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ASINH_API_H__ -#define __INFINIOP_ASINH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(asinh, Asinh) - -#endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h deleted file mode 100644 index 18eed316f..000000000 --- a/include/infiniop/ops/atan.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ATAN_API_H__ -#define __INFINIOP_ATAN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(atan, Atan) - -#endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h deleted file mode 100644 index e7db5b53c..000000000 --- a/include/infiniop/ops/atanh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ATANH_API_H__ -#define __INFINIOP_ATANH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(atanh, Atanh) - -#endif diff --git a/include/infiniop/ops/binary_ops_api.h b/include/infiniop/ops/binary_ops_api.h new file mode 100644 index 000000000..24d7715c9 --- /dev/null +++ b/include/infiniop/ops/binary_ops_api.h @@ -0,0 +1,23 @@ +#ifndef __INFINIOP_BINARY_OPS_API_H__ +#define __INFINIOP_BINARY_OPS_API_H__ + +#include "binary_op_api.h" + +/** + * @brief Unified API declarations for all binary operators. + * + * This header contains API declarations for all binary operators in a single file, + * eliminating the need for individual header files for each operator. + * + * All binary operator APIs are declared here: + * - div, pow, mod, max, min + */ + +// Declare all binary operator APIs +BINARY_OP_API_DECLARE(div, Div) +BINARY_OP_API_DECLARE(pow, Pow) +BINARY_OP_API_DECLARE(mod, Mod) +BINARY_OP_API_DECLARE(max, Max) +BINARY_OP_API_DECLARE(min, Min) + +#endif // __INFINIOP_BINARY_OPS_API_H__ diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h deleted file mode 100644 index 8fca73b2e..000000000 --- a/include/infiniop/ops/ceil.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_CEIL_API_H__ -#define __INFINIOP_CEIL_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(ceil, Ceil) - -#endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h deleted file mode 100644 index ed33b0a0e..000000000 --- a/include/infiniop/ops/cos.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_COS_API_H__ -#define __INFINIOP_COS_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(cos, Cos) - -#endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h deleted file mode 100644 index b607b8fd1..000000000 --- a/include/infiniop/ops/cosh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_COSH_API_H__ -#define __INFINIOP_COSH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(cosh, Cosh) - -#endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h deleted file mode 100644 index 6f146bf4c..000000000 --- a/include/infiniop/ops/div.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_DIV_API_H__ -#define __INFINIOP_DIV_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(div, Div) - -#endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h deleted file mode 100644 index 0dcc149da..000000000 --- a/include/infiniop/ops/erf.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ERF_API_H__ -#define __INFINIOP_ERF_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(erf, Erf) - -#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h deleted file mode 
100644 index 02efc6761..000000000 --- a/include/infiniop/ops/floor.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_FLOOR_API_H__ -#define __INFINIOP_FLOOR_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(floor, Floor) - -#endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h deleted file mode 100644 index 3892ccb6e..000000000 --- a/include/infiniop/ops/log.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_LOG_API_H__ -#define __INFINIOP_LOG_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(log, Log) - -#endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h deleted file mode 100644 index 4b91e5c83..000000000 --- a/include/infiniop/ops/max.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_MAX_API_H__ -#define __INFINIOP_MAX_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(max, Max) - -#endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h deleted file mode 100644 index 1496806df..000000000 --- a/include/infiniop/ops/min.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_MIN_API_H__ -#define __INFINIOP_MIN_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(min, Min) - -#endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h deleted file mode 100644 index e4fcd571e..000000000 --- a/include/infiniop/ops/mod.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_MOD_API_H__ -#define __INFINIOP_MOD_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(mod, Mod) - -#endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h deleted file mode 100644 index 0d18bbd5c..000000000 --- a/include/infiniop/ops/neg.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_NEG_API_H__ -#define __INFINIOP_NEG_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(neg, Neg) - -#endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h deleted file mode 100644 index f4e263a58..000000000 --- a/include/infiniop/ops/pow.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_POW_API_H__ -#define __INFINIOP_POW_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(pow, Pow) - -#endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h deleted file mode 100644 index 7d5626176..000000000 --- a/include/infiniop/ops/reciprocal.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_RECIPROCAL_API_H__ -#define __INFINIOP_RECIPROCAL_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(reciprocal, Reciprocal) - -#endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h deleted file mode 100644 index 1bf4377ff..000000000 --- a/include/infiniop/ops/round.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ROUND_API_H__ -#define __INFINIOP_ROUND_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(round, Round) - -#endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h deleted file mode 100644 index ef7854de8..000000000 --- a/include/infiniop/ops/sign.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_SIGN_API_H__ -#define __INFINIOP_SIGN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(sign, Sign) - -#endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h deleted file mode 100644 index ea8511a2b..000000000 --- a/include/infiniop/ops/sinh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_SINH_API_H__ -#define __INFINIOP_SINH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(sinh, Sinh) - -#endif diff --git 
a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h deleted file mode 100644 index 6df6fe89c..000000000 --- a/include/infiniop/ops/sqrt.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_SQRT_API_H__ -#define __INFINIOP_SQRT_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(sqrt, Sqrt) - -#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h deleted file mode 100644 index d4a2f0bf2..000000000 --- a/include/infiniop/ops/tan.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_TAN_API_H__ -#define __INFINIOP_TAN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(tan, Tan) - -#endif diff --git a/include/infiniop/ops/unary_ops_api.h b/include/infiniop/ops/unary_ops_api.h new file mode 100644 index 000000000..95b0773b6 --- /dev/null +++ b/include/infiniop/ops/unary_ops_api.h @@ -0,0 +1,39 @@ +#ifndef __INFINIOP_UNARY_OPS_API_H__ +#define __INFINIOP_UNARY_OPS_API_H__ + +#include "unary_op_api.h" + +/** + * @brief Unified API declarations for all unary operators. + * + * This header contains API declarations for all unary operators in a single file, + * eliminating the need for individual header files for each operator. + * + * All unary operator APIs are declared here: + * - abs, log, sqrt, reciprocal, neg, round, sinh, sign, tan + * - acosh, asinh, cos, atanh, asin, floor, cosh, erf, atan, acos, ceil + */ + +// Declare all unary operator APIs +UNARY_OP_API_DECLARE(abs, Abs) +UNARY_OP_API_DECLARE(log, Log) +UNARY_OP_API_DECLARE(sqrt, Sqrt) +UNARY_OP_API_DECLARE(reciprocal, Reciprocal) +UNARY_OP_API_DECLARE(neg, Neg) +UNARY_OP_API_DECLARE(round, Round) +UNARY_OP_API_DECLARE(sinh, Sinh) +UNARY_OP_API_DECLARE(sign, Sign) +UNARY_OP_API_DECLARE(tan, Tan) +UNARY_OP_API_DECLARE(acosh, Acosh) +UNARY_OP_API_DECLARE(asinh, Asinh) +UNARY_OP_API_DECLARE(cos, Cos) +UNARY_OP_API_DECLARE(atanh, Atanh) +UNARY_OP_API_DECLARE(asin, Asin) +UNARY_OP_API_DECLARE(floor, Floor) +UNARY_OP_API_DECLARE(cosh, Cosh) +UNARY_OP_API_DECLARE(erf, Erf) +UNARY_OP_API_DECLARE(atan, Atan) +UNARY_OP_API_DECLARE(acos, Acos) +UNARY_OP_API_DECLARE(ceil, Ceil) + +#endif // __INFINIOP_UNARY_OPS_API_H__ diff --git a/scripts/test_binary_unary.py b/scripts/test_binary_unary.py deleted file mode 100755 index 8dbbfbf53..000000000 --- a/scripts/test_binary_unary.py +++ /dev/null @@ -1,143 +0,0 @@ -import os -import subprocess -from set_env import set_env -import sys - -PROJECT_DIR = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "test", "infiniop") -) -os.chdir(PROJECT_DIR) - - -def run_tests(args): - failed = [] - - # Binary operators (重构过的) - binary_tests = [ - "div.py", - "pow.py", - "mod.py", - "min.py", - "max.py", - ] - - # Unary operators (重构过的) - unary_tests = [ - "abs.py", - "log.py", - "cos.py", - "sqrt.py", - "neg.py", - "sign.py", - "reciprocal.py", - "round.py", - "floor.py", - "ceil.py", - "erf.py", - "cosh.py", - "sinh.py", - "tan.py", - "acos.py", - "acosh.py", - "asin.py", - "asinh.py", - "atan.py", - "atanh.py", - ] - - all_tests = binary_tests + unary_tests - - print("\033[94m" + "=" * 60 + "\033[0m") - print("\033[94mTesting Binary and Unary Operators (Refactored)\033[0m") - print("\033[94m" + "=" * 60 + "\033[0m") - print(f"\033[94mTotal tests: {len(all_tests)}\033[0m") - print(f"\033[94m - Binary operators: {len(binary_tests)}\033[0m") - print(f"\033[94m - Unary operators: {len(unary_tests)}\033[0m") - print() - - for test in all_tests: - if not os.path.exists(test): - print(f"\033[93m[SKIP] {test} - test file not found\033[0m") - continue - - 
print(f"\033[96m[RUN] {test}\033[0m", end=" ... ", flush=True) - result = subprocess.run( - f"python3 {test} {args}", - text=True, - encoding="utf-8", - shell=True, - capture_output=True - ) - - if result.returncode != 0: - print(f"\033[91m[FAIL]\033[0m") - print(f"\033[91mError output:\033[0m") - print(result.stderr) - failed.append(test) - else: - print(f"\033[92m[PASS]\033[0m") - - return failed - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser( - description="Test refactored binary and unary operators", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Test on CPU only (default) - python3 scripts/test_binary_unary.py --cpu - - # Test on NVIDIA GPU only - python3 scripts/test_binary_unary.py --nvidia - - # Test on both CPU and NVIDIA - python3 scripts/test_binary_unary.py --cpu --nvidia - - # Test with debug mode - python3 scripts/test_binary_unary.py --cpu --debug - - # Test with profiling - python3 scripts/test_binary_unary.py --nvidia --profile - """ - ) - - # Device selection arguments (same as test files) - parser.add_argument("--cpu", action="store_true", help="Run CPU tests") - parser.add_argument("--nvidia", action="store_true", help="Run NVIDIA GPU tests") - parser.add_argument("--debug", action="store_true", help="Enable debug mode") - parser.add_argument("--profile", action="store_true", help="Enable profiling") - - args, unknown = parser.parse_known_args() - - # Build command line arguments to pass to test files - test_args = [] - if args.cpu: - test_args.append("--cpu") - if args.nvidia: - test_args.append("--nvidia") - if args.debug: - test_args.append("--debug") - if args.profile: - test_args.append("--profile") - - # Add any unknown arguments (for compatibility) - test_args.extend(unknown) - - set_env() - failed = run_tests(" ".join(test_args)) - - print() - print("\033[94m" + "=" * 60 + "\033[0m") - if len(failed) == 0: - print("\033[92m✓ All tests passed!\033[0m") - else: - print(f"\033[91m✗ {len(failed)} test(s) failed:\033[0m") - for test in failed: - print(f"\033[91m - {test}\033[0m") - print("\033[94m" + "=" * 60 + "\033[0m") - - exit(len(failed)) diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc index 051b8711a..8439236eb 100644 --- a/src/infiniop/ops/abs/operator.cc +++ b/src/infiniop/ops/abs/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/abs.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/abs_cpu.h" diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc index 15872b754..3fd50fb51 100644 --- a/src/infiniop/ops/acos/operator.cc +++ b/src/infiniop/ops/acos/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/acos.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/acos_cpu.h" diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc index c1939a54c..0fb30c0f6 100644 --- a/src/infiniop/ops/acosh/operator.cc +++ b/src/infiniop/ops/acosh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/acosh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/acosh_cpu.h" diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc index edb8fa867..8ed07d55d 100644 --- a/src/infiniop/ops/asin/operator.cc +++ b/src/infiniop/ops/asin/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include 
"infiniop/ops/asin.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/asin_cpu.h" diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc index 7b519ec05..020f83dc4 100644 --- a/src/infiniop/ops/asinh/operator.cc +++ b/src/infiniop/ops/asinh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/asinh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/asinh_cpu.h" diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc index 9025489c3..2ee3ad449 100644 --- a/src/infiniop/ops/atan/operator.cc +++ b/src/infiniop/ops/atan/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/atan.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/atan_cpu.h" diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc index cc9d6131e..fb991051c 100644 --- a/src/infiniop/ops/atanh/operator.cc +++ b/src/infiniop/ops/atanh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/atanh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/atanh_cpu.h" diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc index dbe591043..26252ec16 100644 --- a/src/infiniop/ops/ceil/operator.cc +++ b/src/infiniop/ops/ceil/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/ceil.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/ceil_cpu.h" diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 1531c6caa..e3d9237a9 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/cos.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/cos_cpu.h" diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc index 9b18b47ee..c1a6159c1 100644 --- a/src/infiniop/ops/cosh/operator.cc +++ b/src/infiniop/ops/cosh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/cosh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/cosh_cpu.h" diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc index af9d1929a..4ed2374af 100644 --- a/src/infiniop/ops/div/operator.cc +++ b/src/infiniop/ops/div/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/div.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/div_cpu.h" diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc index 9304cf525..eeee864ee 100644 --- a/src/infiniop/ops/erf/operator.cc +++ b/src/infiniop/ops/erf/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/erf.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/erf_cpu.h" diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc index 64e4a586b..bfb4a2466 100644 --- a/src/infiniop/ops/floor/operator.cc +++ b/src/infiniop/ops/floor/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/floor.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/floor_cpu.h" diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc index 9614a0861..b4814ff72 100644 --- 
a/src/infiniop/ops/log/operator.cc +++ b/src/infiniop/ops/log/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/log.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/log_cpu.h" diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc index 3e5299f52..03b6d4eeb 100644 --- a/src/infiniop/ops/max/operator.cc +++ b/src/infiniop/ops/max/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/max.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/max_cpu.h" diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc index 6f67ecf87..1597bb5d3 100644 --- a/src/infiniop/ops/min/operator.cc +++ b/src/infiniop/ops/min/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/min.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/min_cpu.h" diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc index aef892ce1..9f635d6e6 100644 --- a/src/infiniop/ops/mod/operator.cc +++ b/src/infiniop/ops/mod/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/mod.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/mod_cpu.h" diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc index c3945f4bb..e8c99dcdf 100644 --- a/src/infiniop/ops/neg/operator.cc +++ b/src/infiniop/ops/neg/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/neg.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/neg_cpu.h" diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc index b1ddbc9c1..7a24d7a20 100644 --- a/src/infiniop/ops/pow/operator.cc +++ b/src/infiniop/ops/pow/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/pow.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/pow_cpu.h" diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc index 966bd72d8..4c55fdf20 100644 --- a/src/infiniop/ops/reciprocal/operator.cc +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/reciprocal.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/reciprocal_cpu.h" diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc index a20fbcb17..5a1e0fcc5 100644 --- a/src/infiniop/ops/round/operator.cc +++ b/src/infiniop/ops/round/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/round.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/round_cpu.h" diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc index 1a4599d5d..18850ec1f 100644 --- a/src/infiniop/ops/sign/operator.cc +++ b/src/infiniop/ops/sign/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/sign.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/sign_cpu.h" diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc index 41940d235..263d20347 100644 --- a/src/infiniop/ops/sinh/operator.cc +++ b/src/infiniop/ops/sinh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/sinh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include 
"cpu/sinh_cpu.h" diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc index fe999f58f..5962860ca 100644 --- a/src/infiniop/ops/sqrt/operator.cc +++ b/src/infiniop/ops/sqrt/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/sqrt.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/sqrt_cpu.h" diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index ae506dcd8..75dd8277e 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/tan.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/tan_cpu.h" diff --git a/test/infiniop/abs.py b/test/infiniop/abs.py deleted file mode 100644 index df8748a97..000000000 --- a/test/infiniop/abs.py +++ /dev/null @@ -1,164 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def abs_op(x): - return torch.abs(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for abs operation - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Abs on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = abs_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAbsDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - 
tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAbsWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_abs(): - check_error( - LIBINFINIOP.infiniopAbs( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_abs() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: abs_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_abs(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acos.py b/test/infiniop/acos.py deleted file mode 100644 index d39e966c4..000000000 --- a/test/infiniop/acos.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def acos_op(x): - return torch.acos(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for acos operation - # acos domain is [-1, 1], so we use range [-1, 1) - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, 
dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Acos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = acos_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAcosDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAcosWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_acos(): - check_error( - LIBINFINIOP.infiniopAcos( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_acos() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: acos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_acos(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAcosDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acosh.py b/test/infiniop/acosh.py deleted file mode 100644 index c6777998b..000000000 --- a/test/infiniop/acosh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE 
= False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def acosh_op(x): - return torch.acosh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [1, 101) for acosh operation - # acosh domain is [1, +∞), so we use range [1, 101) - x_torch_tensor = torch.rand(shape) * 100 + 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Acosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = acosh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAcoshDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAcoshWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_acosh(): - check_error( - LIBINFINIOP.infiniopAcosh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_acosh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: acosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_acosh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAcoshDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asin.py b/test/infiniop/asin.py deleted file mode 100644 index 18cf0ec8e..000000000 --- a/test/infiniop/asin.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in 
_TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def asin_op(x): - return torch.asin(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for asin operation - # asin domain is [-1, 1], so we use range [-1, 1) - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Asin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = asin_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAsinDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAsinWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_asin(): - check_error( - LIBINFINIOP.infiniopAsin( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_asin() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: asin_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_asin(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAsinDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asinh.py b/test/infiniop/asinh.py deleted file mode 100644 index d051d486e..000000000 --- a/test/infiniop/asinh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - 
infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def asinh_op(x): - return torch.asinh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [0, 100) for asinh operation - # asinh domain is (-∞, +∞), so we use range [0, 100) - x_torch_tensor = torch.rand(shape) * 100 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Asinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = asinh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAsinhDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAsinhWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_asinh(): - check_error( - LIBINFINIOP.infiniopAsinh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_asinh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: asinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_asinh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAsinhDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - 
for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atan.py b/test/infiniop/atan.py deleted file mode 100644 index 01fceff5b..000000000 --- a/test/infiniop/atan.py +++ /dev/null @@ -1,164 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3, 13, 9, 17),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def atan_op(x): - return torch.atan(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for atan operation - # atan domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Atan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = atan_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAtanDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAtanWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_atan(): - check_error( - LIBINFINIOP.infiniopAtan( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_atan() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert 
torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: atan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_atan(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAtanDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py deleted file mode 100644 index 74073a6f2..000000000 --- a/test/infiniop/atanh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def atanh_op(x): - return torch.atanh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for atanh operation - # atanh domain is (-1, 1), so we use range [-1, 1) - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = atanh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAtanhDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being 
directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAtanhWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_atanh(): - check_error( - LIBINFINIOP.infiniopAtanh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_atanh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: atanh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/ceil.py b/test/infiniop/ceil.py deleted file mode 100644 index afc1993c1..000000000 --- a/test/infiniop/ceil.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def ceil_op(x): - return torch.ceil(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-20, -10) for ceil operation - # ceil domain is (-∞, +∞), so we use range [-20, -10) - x_torch_tensor = torch.rand(shape) * 10 - 20 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - 
set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Ceil on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = ceil_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateCeilDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetCeilWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_ceil(): - check_error( - LIBINFINIOP.infiniopCeil( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_ceil() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: ceil_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_ceil(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyCeilDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py deleted file mode 100644 index 972f17b7b..000000000 --- a/test/infiniop/cos.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: 
atol=1e-4, rtol=1e-2 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, - InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def cos_op(x): - return torch.cos(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for cos operation - # cos domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = cos_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateCosDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetCosWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_cos(): - check_error( - LIBINFINIOP.infiniopCos( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_cos() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: cos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cosh.py b/test/infiniop/cosh.py deleted file mode 100644 index ee7994531..000000000 --- a/test/infiniop/cosh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - 
((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def cosh_op(x): - return torch.cosh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for cosh operation - # cosh domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Cosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = cosh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateCoshDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetCoshWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_cosh(): - check_error( - LIBINFINIOP.infiniopCosh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_cosh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: cosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_cosh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyCoshDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py deleted file mode 100644 index 17b22b2e5..000000000 --- a/test/infiniop/div.py +++ /dev/null @@ -1,192 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - 
test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: F32 tolerance is relaxed compared to theoretical precision due to: -# - Old operators library uses vectorized operations (pack_size=4) with vecN -# - InfiniCore uses elementwise operations, which can cause 1 ULP differences -# - This is acceptable as it's within floating-point precision limits -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def div(c, a, b): - # Only support F16 and F32 (matching old operators library) - torch.div(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - a = TestTensor(shape, a_stride, dtype, device) - # For division, ensure b doesn't contain zeros to avoid division by zero - # Similar to old test: b = torch.rand(...) 
* 2, which gives range [0, 2) - # Use scale=2 to ensure values are in [0, 2) range, then add small bias to avoid zero - b = TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateDivDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetDivWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, device) - - def lib_div(): - check_error( - LIBINFINIOP.infiniopDiv( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_div() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py deleted file mode 100644 index f5f9c4cd9..000000000 --- a/test/infiniop/erf.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test 
case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def erf_op(x): - return torch.erf(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-3, 3) for erf operation - # erf domain is (-∞, +∞), so we use range [-3, 3) - x_torch_tensor = torch.rand(shape) * 6 - 3 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = erf_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateErfDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetErfWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_erf(): - check_error( - LIBINFINIOP.infiniopErf( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_erf() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: erf_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_erf(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/floor.py b/test/infiniop/floor.py deleted file mode 100644 index b981da809..000000000 --- a/test/infiniop/floor.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - 
profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def floor_op(x): - return torch.floor(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-20, -10) for floor operation - # floor domain is (-∞, +∞), so we use range [-20, -10) - x_torch_tensor = torch.rand(shape) * 10 - 20 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Floor on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = floor_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateFloorDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetFloorWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_floor(): - check_error( - LIBINFINIOP.infiniopFloor( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_floor() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: floor_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_floor(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyFloorDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in 
get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/binary_test_base.py b/test/infiniop/libinfiniop/binary_test_base.py new file mode 100644 index 000000000..c9da5b4de --- /dev/null +++ b/test/infiniop/libinfiniop/binary_test_base.py @@ -0,0 +1,273 @@ +""" +Base test template for binary operators. + +This module provides a unified test framework for all binary operators, +eliminating code duplication across individual test scripts. + +Usage: + from libinfiniop.binary_test_base import BinaryTestBase + + class DivTest(BinaryTestBase): + OP_NAME = "Div" + OP_NAME_LOWER = "div" + + @staticmethod + def torch_op(c, a, b): + torch.div(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For division, ensure b doesn't contain zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + if __name__ == "__main__": + DivTest.run() +""" + +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Common test cases for binary operators +_BINARY_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +# Inplace options applied for each test case +_BINARY_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _BINARY_INPLACE to each tuple in _BINARY_TEST_CASES_ +_BINARY_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _BINARY_TEST_CASES_ + for inplace_item in _BINARY_INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_BINARY_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + + +class BinaryTestBase: + """ + Base class for binary operator tests.
+ + Subclasses must define: + - OP_NAME: Uppercase operator name (e.g., "Div", "Pow") + - OP_NAME_LOWER: Lowercase operator name (e.g., "div", "pow") + - torch_op: Static method that performs the PyTorch operation + - generate_input_a: Static method that generates first input tensor + - generate_input_b: Static method that generates second input tensor + - TOLERANCE_MAP: Dictionary mapping dtype to tolerance values + """ + + OP_NAME = None + OP_NAME_LOWER = None + + # Default tolerance map (can be overridden) + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + # Test cases (can be overridden) + TEST_CASES = _BINARY_TEST_CASES + TENSOR_DTYPES = _BINARY_TENSOR_DTYPES + + DEBUG = False + PROFILE = False + NUM_PRERUN = 10 + NUM_ITERATIONS = 1000 + + @staticmethod + def torch_op(c, a, b): + """PyTorch operation - must be implemented by subclass""" + raise NotImplementedError("Subclass must implement torch_op") + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + """ + Generate first input tensor - must be implemented by subclass. + + Args: + shape: Tensor shape tuple + a_stride: Stride tuple or None + dtype: InfiniDtype enum value + device: InfiniDeviceEnum value + + Returns: + TestTensor: Generated first input tensor + """ + raise NotImplementedError("Subclass must implement generate_input_a") + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + """ + Generate second input tensor - must be implemented by subclass. + + Args: + shape: Tensor shape tuple + b_stride: Stride tuple or None + dtype: InfiniDtype enum value + device: InfiniDeviceEnum value + + Returns: + TestTensor: Generated second input tensor + """ + raise NotImplementedError("Subclass must implement generate_input_b") + + @classmethod + def test(cls, handle, device, shape, a_stride=None, b_stride=None, c_stride=None, + inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None): + """Common test function for binary operators""" + a = cls.generate_input_a(shape, a_stride, dtype, device) + b = cls.generate_input_b(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing {cls.OP_NAME} on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + cls.torch_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + create_func = getattr(LIBINFINIOP, f"infiniopCreate{cls.OP_NAME}Descriptor") + check_error( + create_func( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + get_workspace_func = getattr(LIBINFINIOP, f"infiniopGet{cls.OP_NAME}WorkspaceSize") + check_error( + get_workspace_func( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_op(): + op_func = getattr(LIBINFINIOP, f"infiniop{cls.OP_NAME}") + check_error( + 
op_func( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_op() + if sync is not None: + sync() + + atol, rtol = get_tolerance(cls.TOLERANCE_MAP, dtype) + if cls.DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + equal_nan = getattr(cls, 'EQUAL_NAN', False) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=equal_nan) + + # Profiling workflow + if cls.PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cls.torch_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_op(), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + # fmt: on + + destroy_func = getattr(LIBINFINIOP, f"infiniopDestroy{cls.OP_NAME}Descriptor") + check_error(destroy_func(descriptor)) + + @classmethod + def run(cls): + """Run the test""" + args = get_args() + + # Configure testing options + cls.DEBUG = args.debug + cls.PROFILE = args.profile + cls.NUM_PRERUN = args.num_prerun + cls.NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, cls.test, cls.TEST_CASES, cls.TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/unary_test_base.py b/test/infiniop/libinfiniop/unary_test_base.py new file mode 100644 index 000000000..648a97d3e --- /dev/null +++ b/test/infiniop/libinfiniop/unary_test_base.py @@ -0,0 +1,242 @@ +""" +Base test template for unary operators. + +This module provides a unified test framework for all unary operators, +eliminating code duplication across individual test scripts. + +Usage: + from libinfiniop.unary_test_base import UnaryTestBase + + class AbsTest(UnaryTestBase): + OP_NAME = "Abs" + OP_NAME_LOWER = "abs" + + @staticmethod + def torch_op(x): + return torch.abs(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate test tensors with values in range [-1, 1) for abs operation + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + if __name__ == "__main__": + AbsTest.run() +""" + +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) +from libinfiniop.utils import to_torch_dtype +from libinfiniop.devices import torch_device_map + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Common test cases for unary operators +_UNARY_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + +# Inplace options applied for each test case +_UNARY_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_UNARY_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _UNARY_TEST_CASES_ + for inplace_item in _UNARY_INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_UNARY_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + + +class UnaryTestBase: + """ + Base 
class for unary operator tests. + + Subclasses must define: + - OP_NAME: Uppercase operator name (e.g., "Abs", "Log") + - OP_NAME_LOWER: Lowercase operator name (e.g., "abs", "log") + - torch_op: Static method that performs the PyTorch operation + - generate_input: Static method that generates input tensor + - TOLERANCE_MAP: Dictionary mapping dtype to tolerance values + """ + + OP_NAME = None + OP_NAME_LOWER = None + + # Default tolerance map (can be overridden) + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + # Test cases (can be overridden) + TEST_CASES = _UNARY_TEST_CASES + TENSOR_DTYPES = _UNARY_TENSOR_DTYPES + + DEBUG = False + PROFILE = False + NUM_PRERUN = 10 + NUM_ITERATIONS = 1000 + + @staticmethod + def torch_op(x): + """PyTorch operation - must be implemented by subclass""" + raise NotImplementedError("Subclass must implement torch_op") + + @staticmethod + def generate_input(shape, dtype, device): + """ + Generate input tensor - must be implemented by subclass. + + Args: + shape: Tensor shape tuple + dtype: PyTorch dtype (e.g., torch.float16, torch.float32) + device: PyTorch device string (e.g., "cpu", "cuda") + + Returns: + torch.Tensor: Generated input tensor + """ + raise NotImplementedError("Subclass must implement generate_input") + + @classmethod + def test(cls, handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None): + """Common test function for unary operators""" + from libinfiniop.devices import torch_device_map + from libinfiniop.utils import to_torch_dtype + + # Generate input tensor + torch_dtype = to_torch_dtype(dtype) + torch_device = torch_device_map[device] + x_torch_tensor = cls.generate_input(shape, torch_dtype, torch_device) + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing {cls.OP_NAME} on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cls.torch_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + create_func = getattr(LIBINFINIOP, f"infiniopCreate{cls.OP_NAME}Descriptor") + check_error( + create_func( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + get_workspace_func = getattr(LIBINFINIOP, f"infiniopGet{cls.OP_NAME}WorkspaceSize") + check_error( + get_workspace_func( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_op(): + op_func = getattr(LIBINFINIOP, f"infiniop{cls.OP_NAME}") + check_error( + op_func( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_op() + if sync is not None: + sync() + + atol, rtol = get_tolerance(cls.TOLERANCE_MAP, dtype) + equal_nan = getattr(cls, 'EQUAL_NAN', False) + + if cls.DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=equal_nan) + + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=equal_nan) + + # Profiling workflow + if cls.PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: 
cls.torch_op(x.torch_tensor()), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_op(), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + # fmt: on + + destroy_func = getattr(LIBINFINIOP, f"infiniopDestroy{cls.OP_NAME}Descriptor") + check_error(destroy_func(descriptor)) + + @classmethod + def run(cls): + """Run the test""" + args = get_args() + + # Configure testing options + cls.DEBUG = args.debug + cls.PROFILE = args.profile + cls.NUM_PRERUN = args.num_prerun + cls.NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, cls.test, cls.TEST_CASES, cls.TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/log.py b/test/infiniop/log.py deleted file mode 100644 index 4f97de374..000000000 --- a/test/infiniop/log.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=1e-7, rtol=1e-3 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def log_op(x): - return torch.log(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [0.1, 1.1) for log operation - # log domain is (0, +∞), so we use range [0.1, 1.1) - x_torch_tensor = torch.rand(shape) + 0.1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Log on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = log_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateLogDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used 
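For reference, here is a rough sketch of how one of the per-operator tests removed later in this patch (max.py) could be re-expressed on top of the new BinaryTestBase. The MaxTest class name is illustrative and not part of the patch; the tolerances, input generation, and equal_nan behaviour mirror the deleted max.py test, under the assumption that the BinaryTestBase API added above is used as-is.

import torch
from libinfiniop import InfiniDtype, TestTensor
from libinfiniop.binary_test_base import BinaryTestBase


class MaxTest(BinaryTestBase):
    OP_NAME = "Max"
    OP_NAME_LOWER = "max"

    # Tolerances copied from the removed max.py test
    TOLERANCE_MAP = {
        InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
        InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    }

    # The removed test compared with equal_nan=True; the base class reads
    # this optional flag via getattr(cls, 'EQUAL_NAN', False)
    EQUAL_NAN = True

    @staticmethod
    def torch_op(c, a, b):
        # PyTorch reference result, written into c in place
        torch.maximum(a, b, out=c)

    @staticmethod
    def generate_input_a(shape, a_stride, dtype, device):
        # max has no domain restrictions, so a plain strided random tensor suffices
        return TestTensor(shape, a_stride, dtype, device)

    @staticmethod
    def generate_input_b(shape, b_stride, dtype, device):
        return TestTensor(shape, b_stride, dtype, device)


if __name__ == "__main__":
    MaxTest.run()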
by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetLogWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_log(): - check_error( - LIBINFINIOP.infiniopLog( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_log() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: log_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_log(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyLogDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/max.py b/test/infiniop/max.py deleted file mode 100644 index e4221cf3e..000000000 --- a/test/infiniop/max.py +++ /dev/null @@ -1,189 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: F32 tolerance is relaxed compared to theoretical precision due to: -# - Old operators library uses vectorized operations (pack_size=4) with vecN -# - InfiniCore uses elementwise operations, which can cause 1 
ULP differences -# - This is acceptable as it's within floating-point precision limits -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def max(c, a, b): - # Only support F16 and F32 (matching old operators library) - torch.maximum(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - a = TestTensor(shape, a_stride, dtype, device) - b = TestTensor(shape, b_stride, dtype, device) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Max on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateMaxDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetMaxWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, device) - - def lib_max(): - check_error( - LIBINFINIOP.infiniopMax( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_max() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_max(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyMaxDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/min.py b/test/infiniop/min.py deleted file mode 100644 index 19f52a334..000000000 --- a/test/infiniop/min.py +++ /dev/null @@ -1,189 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# 
============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: F32 tolerance is relaxed compared to theoretical precision due to: -# - Old operators library uses vectorized operations (pack_size=4) with vecN -# - InfiniCore uses elementwise operations, which can cause 1 ULP differences -# - This is acceptable as it's within floating-point precision limits -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def min(c, a, b): - # Only support F16 and F32 (matching old operators library) - torch.minimum(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - a = TestTensor(shape, a_stride, dtype, device) - b = TestTensor(shape, b_stride, dtype, device) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Min on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateMinDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetMinWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = 
TestWorkspace(workspace_size.value, device) - - def lib_min(): - check_error( - LIBINFINIOP.infiniopMin( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_min() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_min(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyMinDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/mod.py b/test/infiniop/mod.py deleted file mode 100644 index 298f3137f..000000000 --- a/test/infiniop/mod.py +++ /dev/null @@ -1,190 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: mod operation uses fmod for floating point, which should be exact -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def mod_op(c, a, b): - torch.fmod(a, b, out=c) - - -def test( - handle, - device, - 
shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - # Generate test tensors with values in a reasonable range for mod operation - # Use scale=10 to get values in [0, 10) range, similar to old test - a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0) - # Ensure b doesn't contain zeros to avoid division by zero in mod - b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=0.1) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Mod on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateModDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetModWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, device) - - def lib_mod(): - check_error( - LIBINFINIOP.infiniopMod( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_mod() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_mod(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyModDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/neg.py b/test/infiniop/neg.py deleted file mode 100644 index 62607bce0..000000000 --- a/test/infiniop/neg.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# 
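Similarly, a unary test such as the neg.py file removed in this patch could be rebuilt on UnaryTestBase. This is a minimal sketch under the assumption that the UnaryTestBase API added above is used as-is; the NegTest name is illustrative, and the tolerances and input range are taken from the deleted neg.py.

import torch
from libinfiniop import InfiniDtype
from libinfiniop.unary_test_base import UnaryTestBase


class NegTest(UnaryTestBase):
    OP_NAME = "Neg"
    OP_NAME_LOWER = "neg"

    # Tolerances copied from the removed neg.py test
    TOLERANCE_MAP = {
        InfiniDtype.F16: {"atol": 0, "rtol": 1e-3},
        InfiniDtype.F32: {"atol": 0, "rtol": 1e-7},
    }

    @staticmethod
    def torch_op(x):
        return torch.neg(x).to(x.dtype)

    @staticmethod
    def generate_input(shape, dtype, device):
        # Same input range as the removed test: values in [-200, -100)
        return torch.rand(shape, dtype=dtype, device=device) * 100 - 200


if __name__ == "__main__":
    NegTest.run()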
============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def neg_op(x): - return torch.neg(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for neg operation - # This matches the original test case: * 100 - 200 - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Neg on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = neg_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateNegDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetNegWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_neg(): - check_error( - LIBINFINIOP.infiniopNeg( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_neg() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: neg_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_neg(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyNegDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pow.py b/test/infiniop/pow.py deleted file mode 
100644 index f437c4229..000000000 --- a/test/infiniop/pow.py +++ /dev/null @@ -1,190 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing -# Note: Only F16 and F32 are supported, matching the old repository's binary operator -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: pow operation may have larger numerical errors, especially for F16 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def pow_op(c, a, b): - torch.pow(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - # Generate test tensors with values in a reasonable range for pow operation - # Avoid negative bases and very large exponents to prevent numerical issues - a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) - b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Pow on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreatePowDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent 
them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetPowWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, c.device) - - def lib_pow(): - check_error( - LIBINFINIOP.infiniopPow( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_pow() - - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - # Use equal_nan=True to handle NaN cases in pow operation - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_pow(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyPowDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py deleted file mode 100644 index 4e816481c..000000000 --- a/test/infiniop/reciprocal.py +++ /dev/null @@ -1,168 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def reciprocal_op(x): - return torch.reciprocal(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-10, 10) for reciprocal operation - # This matches the 
original test case: * 20 - 10 - # Note: Avoid values too close to zero to prevent division by zero issues - x_torch_tensor = torch.rand(shape) * 20 - 10 - # Ensure no zero values - x_torch_tensor = torch.where(x_torch_tensor == 0, torch.ones_like(x_torch_tensor), x_torch_tensor) - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = reciprocal_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateReciprocalDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_reciprocal(): - check_error( - LIBINFINIOP.infiniopReciprocal( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_reciprocal() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: reciprocal_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/round.py b/test/infiniop/round.py deleted file mode 100644 index d6053f676..000000000 --- a/test/infiniop/round.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - 
Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def round_op(x): - return torch.round(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-20, -10) for round operation - # This matches the original test case: * 10 - 20 - x_torch_tensor = torch.rand(shape) * 10 - 20 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Round on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = round_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateRoundDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetRoundWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_round(): - check_error( - LIBINFINIOP.infiniopRound( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_round() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: round_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_round(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyRoundDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sign.py b/test/infiniop/sign.py deleted file mode 100644 index f0eb5b5f8..000000000 --- a/test/infiniop/sign.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, 
-) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=0, rtol=0 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 0}, - InfiniDtype.F32: {"atol": 0, "rtol": 0}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def sign_op(x): - return torch.sign(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for sign operation - # sign domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Sign on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = sign_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateSignDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetSignWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_sign(): - check_error( - LIBINFINIOP.infiniopSign( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_sign() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: sign_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_sign(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroySignDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in 
get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sinh.py b/test/infiniop/sinh.py deleted file mode 100644 index 99bc02c58..000000000 --- a/test/infiniop/sinh.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=0, rtol=0 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 0}, - InfiniDtype.F32: {"atol": 0, "rtol": 0}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def sinh_op(x): - return torch.sinh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for sinh operation - # sinh domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Sinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = sinh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateSinhDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetSinhWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_sinh(): - check_error( - LIBINFINIOP.infiniopSinh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_sinh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, 
atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: sinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_sinh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroySinhDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sqrt.py b/test/infiniop/sqrt.py deleted file mode 100644 index 6e1419971..000000000 --- a/test/infiniop/sqrt.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=0, rtol=1e-3 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def sqrt_op(x): - return torch.sqrt(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [0, 100) for sqrt operation - # sqrt domain is [0, +∞), so we use range [0, 100) - x_torch_tensor = torch.rand(shape) * 100 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Sqrt on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = sqrt_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateSqrtDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the 
shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetSqrtWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_sqrt(): - check_error( - LIBINFINIOP.infiniopSqrt( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_sqrt() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: sqrt_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_sqrt(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroySqrtDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tan.py b/test/infiniop/tan.py deleted file mode 100644 index 877f5dd58..000000000 --- a/test/infiniop/tan.py +++ /dev/null @@ -1,167 +0,0 @@ -import ctypes -import math -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=1e-6, rtol=1e-2 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-6, "rtol": 1e-2}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def tan_op(x): - return torch.tan(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-2π, 2π) for tan operation - # tan domain is (-∞, +∞), so we use range [-2π, 2π) - x_torch_tensor = torch.rand(shape) * 4 * 
math.pi - 2 * math.pi - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Tan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = tan_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateTanDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetTanWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_tan(): - check_error( - LIBINFINIOP.infiniopTan( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_tan() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: tan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_tan(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyTanDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/test_all_binary_ops.py b/test/infiniop/test_all_binary_ops.py new file mode 100644 index 000000000..e08b3e41b --- /dev/null +++ b/test/infiniop/test_all_binary_ops.py @@ -0,0 +1,251 @@ +""" +统一测试所有 Binary 算子 + +这个文件包含所有 binary 算子的测试,方便统一管理和运行。 +可以通过命令行参数选择运行哪些算子,或者运行所有算子。 + +使用方法: + # 运行所有 binary 算子测试 + python test_all_binary_ops.py + + # 只运行 div 和 pow 算子 + python test_all_binary_ops.py --ops div pow + + # 运行特定设备上的测试 + python test_all_binary_ops.py --cpu --nvidia +""" + +import torch +import argparse +from libinfiniop import InfiniDtype, TestTensor +from libinfiniop.binary_test_base import BinaryTestBase + + +# ============================================================================== +# 所有 Binary 算子的测试类定义 +# ============================================================================== + +class DivTest(BinaryTestBase): + OP_NAME = "Div" + OP_NAME_LOWER = "div" + + @staticmethod + def torch_op(c, a, b): + torch.div(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For division, ensure b doesn't contain zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class PowTest(BinaryTestBase): 
+ OP_NAME = "Pow" + OP_NAME_LOWER = "pow" + + @staticmethod + def torch_op(c, a, b): + torch.pow(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Avoid negative bases and very large exponents + return TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + +class ModTest(BinaryTestBase): + OP_NAME = "Mod" + OP_NAME_LOWER = "mod" + + @staticmethod + def torch_op(c, a, b): + torch.remainder(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Avoid zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class MaxTest(BinaryTestBase): + OP_NAME = "Max" + OP_NAME_LOWER = "max" + + @staticmethod + def torch_op(c, a, b): + torch.maximum(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class MinTest(BinaryTestBase): + OP_NAME = "Min" + OP_NAME_LOWER = "min" + + @staticmethod + def torch_op(c, a, b): + torch.minimum(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +# ============================================================================== +# 算子注册表 +# ============================================================================== + +# 所有 binary 算子的测试类映射 +BINARY_OP_TESTS = { + "div": DivTest, + "pow": PowTest, + "mod": ModTest, + "max": MaxTest, + "min": MinTest, +} + + +# ============================================================================== +# 主函数 +# ============================================================================== + +def main(): + # 先获取基础参数解析器 + from libinfiniop.utils import get_args as get_base_args + import sys + + # 创建新的参数解析器,添加 --ops 参数 + parser = argparse.ArgumentParser(description="Test all binary operators", parents=[]) + parser.add_argument( + "--ops", + nargs="+", + choices=list(BINARY_OP_TESTS.keys()), + default=list(BINARY_OP_TESTS.keys()), + help="Specify which operators to test (default: all)", + ) + + # 解析参数 + args, unknown = parser.parse_known_args() + + # 将未知参数传递给基础参数解析器 + if unknown: + sys.argv = [sys.argv[0]] + unknown + base_args = get_base_args() + else: + # 如果没有其他参数,使用默认值 + sys.argv = [sys.argv[0]] + base_args = get_base_args() + + # 合并参数 + for attr in dir(base_args): + if not attr.startswith("_") and not hasattr(args, attr): + 
setattr(args, attr, getattr(base_args, attr)) + + # 运行选定的算子测试 + print(f"\n{'='*60}") + print(f"Testing {len(args.ops)} binary operator(s): {', '.join(args.ops)}") + print(f"{'='*60}\n") + + failed_ops = [] + passed_ops = [] + + for op_name in args.ops: + test_class = BINARY_OP_TESTS[op_name] + print(f"\n{'='*60}") + print(f"Testing {test_class.OP_NAME} operator") + print(f"{'='*60}") + + try: + # 创建临时参数对象,传递给测试类 + test_class.DEBUG = args.debug + test_class.PROFILE = args.profile + test_class.NUM_PRERUN = args.num_prerun + test_class.NUM_ITERATIONS = args.num_iterations + + # 运行测试 + for device in get_test_devices(args): + test_operator(device, test_class.test, test_class.TEST_CASES, test_class.TENSOR_DTYPES) + + print(f"\033[92m{test_class.OP_NAME} test passed!\033[0m") + passed_ops.append(op_name) + except Exception as e: + print(f"\033[91m{test_class.OP_NAME} test failed: {e}\033[0m") + failed_ops.append(op_name) + if args.debug: + import traceback + traceback.print_exc() + + # 打印总结 + print(f"\n{'='*60}") + print("Test Summary") + print(f"{'='*60}") + print(f"Total operators: {len(args.ops)}") + print(f"\033[92mPassed: {len(passed_ops)} - {', '.join(passed_ops)}\033[0m") + if failed_ops: + print(f"\033[91mFailed: {len(failed_ops)} - {', '.join(failed_ops)}\033[0m") + print(f"{'='*60}\n") + + if failed_ops: + exit(1) + + +if __name__ == "__main__": + from libinfiniop.utils import get_test_devices, test_operator + main() diff --git a/test/infiniop/test_all_unary_ops.py b/test/infiniop/test_all_unary_ops.py new file mode 100644 index 000000000..b9d7cdc8b --- /dev/null +++ b/test/infiniop/test_all_unary_ops.py @@ -0,0 +1,548 @@ +""" +统一测试所有 Unary 算子 + +这个文件包含所有 unary 算子的测试,方便统一管理和运行。 +可以通过命令行参数选择运行哪些算子,或者运行所有算子。 + +使用方法: + # 运行所有 unary 算子测试 + python test_all_unary_ops.py + + # 只运行 abs 和 log 算子 + python test_all_unary_ops.py --ops abs log + + # 运行特定设备上的测试 + python test_all_unary_ops.py --cpu --nvidia +""" + +import torch +import argparse +from libinfiniop import InfiniDtype +from libinfiniop.unary_test_base import UnaryTestBase + + +# ============================================================================== +# 所有 Unary 算子的测试类定义 +# ============================================================================== + +class AbsTest(UnaryTestBase): + OP_NAME = "Abs" + OP_NAME_LOWER = "abs" + + @staticmethod + def torch_op(x): + return torch.abs(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + +class AcosTest(UnaryTestBase): + OP_NAME = "Acos" + OP_NAME_LOWER = "acos" + + @staticmethod + def torch_op(x): + return torch.acos(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # acos domain is [-1, 1] + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AcoshTest(UnaryTestBase): + OP_NAME = "Acosh" + OP_NAME_LOWER = "acosh" + + @staticmethod + def torch_op(x): + return torch.acosh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # acosh domain is [1, +∞) + return torch.rand(shape, dtype=dtype, device=device) * 10 + 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + 
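+# ------------------------------------------------------------------------------
+# Note: UnaryTestBase (libinfiniop/unary_test_base.py) and the BinaryTestBase
+# used by test_all_binary_ops.py above are not part of this diff. Judging from
+# how the subclasses in these two files use them, each base class is assumed to
+# provide roughly the following hooks (a sketch of the expected interface, not
+# the actual implementation):
+#   OP_NAME / OP_NAME_LOWER    - select the infiniopCreate<Op>Descriptor,
+#                                infiniopGet<Op>WorkspaceSize, infiniop<Op> and
+#                                infiniopDestroy<Op>Descriptor bindings to call;
+#   torch_op(...)              - computes the PyTorch reference result;
+#   generate_input(...)        - builds operands restricted to the op's domain;
+#   TOLERANCE_MAP / EQUAL_NAN  - per-dtype atol/rtol handed to torch.allclose,
+#                                which checks |actual - ref| <= atol + rtol*|ref|;
+#   TEST_CASES / TENSOR_DTYPES, the DEBUG/PROFILE/NUM_PRERUN/NUM_ITERATIONS
+#   toggles set by main(), and a test(...) driver that mirrors the per-operator
+#   scripts (round.py, sign.py, sinh.py, sqrt.py, tan.py, ...) removed earlier
+#   in this patch.
+# ------------------------------------------------------------------------------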
+class AsinTest(UnaryTestBase): + OP_NAME = "Asin" + OP_NAME_LOWER = "asin" + + @staticmethod + def torch_op(x): + return torch.asin(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # asin domain is [-1, 1] + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AsinhTest(UnaryTestBase): + OP_NAME = "Asinh" + OP_NAME_LOWER = "asinh" + + @staticmethod + def torch_op(x): + return torch.asinh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AtanTest(UnaryTestBase): + OP_NAME = "Atan" + OP_NAME_LOWER = "atan" + + @staticmethod + def torch_op(x): + return torch.atan(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AtanhTest(UnaryTestBase): + OP_NAME = "Atanh" + OP_NAME_LOWER = "atanh" + + @staticmethod + def torch_op(x): + return torch.atanh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # atanh domain is (-1, 1) + return torch.rand(shape, dtype=dtype, device=device) * 1.8 - 0.9 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class CeilTest(UnaryTestBase): + OP_NAME = "Ceil" + OP_NAME_LOWER = "ceil" + + @staticmethod + def torch_op(x): + return torch.ceil(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + +class CosTest(UnaryTestBase): + OP_NAME = "Cos" + OP_NAME_LOWER = "cos" + + @staticmethod + def torch_op(x): + return torch.cos(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate test tensors with values in range [-200, -100) for cos operation + # cos domain is (-∞, +∞), so we use range [-200, -100) + return torch.rand(shape, dtype=dtype, device=device) * 100 - 200 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, + } + + EQUAL_NAN = True + + +class CoshTest(UnaryTestBase): + OP_NAME = "Cosh" + OP_NAME_LOWER = "cosh" + + @staticmethod + def torch_op(x): + return torch.cosh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class ErfTest(UnaryTestBase): + OP_NAME = "Erf" + OP_NAME_LOWER = "erf" + + @staticmethod + def torch_op(x): + return torch.erf(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + 
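+# ------------------------------------------------------------------------------
+# Usage illustration: adding one more unary operator to this harness only takes
+# a small subclass plus an entry in UNARY_OP_TESTS below. The sketch uses exp as
+# the example; its C bindings (infiniopCreateExpDescriptor and friends) are only
+# introduced by a later patch in this series, so the class is shown here purely
+# to document the pattern and is deliberately not registered.
+class ExpTest(UnaryTestBase):
+    OP_NAME = "Exp"
+    OP_NAME_LOWER = "exp"
+
+    @staticmethod
+    def torch_op(x):
+        return torch.exp(x).to(x.dtype)
+
+    @staticmethod
+    def generate_input(shape, dtype, device):
+        # Keep inputs in [-1, 1) so exp() stays well inside the F16 range.
+        return torch.rand(shape, dtype=dtype, device=device) * 2 - 1
+
+    TOLERANCE_MAP = {
+        InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+        InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    }
+
+    EQUAL_NAN = True
+# ------------------------------------------------------------------------------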
+class FloorTest(UnaryTestBase): + OP_NAME = "Floor" + OP_NAME_LOWER = "floor" + + @staticmethod + def torch_op(x): + return torch.floor(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class LogTest(UnaryTestBase): + OP_NAME = "Log" + OP_NAME_LOWER = "log" + + @staticmethod + def torch_op(x): + return torch.log(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log domain is (0, +∞), so we use range [0.1, 1.1) + return torch.rand(shape, dtype=dtype, device=device) + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + +class NegTest(UnaryTestBase): + OP_NAME = "Neg" + OP_NAME_LOWER = "neg" + + @staticmethod + def torch_op(x): + return torch.neg(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class ReciprocalTest(UnaryTestBase): + OP_NAME = "Reciprocal" + OP_NAME_LOWER = "reciprocal" + + @staticmethod + def torch_op(x): + return torch.reciprocal(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Avoid zeros + return torch.rand(shape, dtype=dtype, device=device) * 2 + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class RoundTest(UnaryTestBase): + OP_NAME = "Round" + OP_NAME_LOWER = "round" + + @staticmethod + def torch_op(x): + return torch.round(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class SignTest(UnaryTestBase): + OP_NAME = "Sign" + OP_NAME_LOWER = "sign" + + @staticmethod + def torch_op(x): + return torch.sign(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class SinhTest(UnaryTestBase): + OP_NAME = "Sinh" + OP_NAME_LOWER = "sinh" + + @staticmethod + def torch_op(x): + return torch.sinh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class SqrtTest(UnaryTestBase): + OP_NAME = "Sqrt" + OP_NAME_LOWER = "sqrt" + + @staticmethod + def torch_op(x): + return torch.sqrt(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # sqrt domain is [0, +∞) + return torch.rand(shape, dtype=dtype, device=device) * 100 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + +class TanTest(UnaryTestBase): + OP_NAME = "Tan" + 
OP_NAME_LOWER = "tan" + + @staticmethod + def torch_op(x): + return torch.tan(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +# ============================================================================== +# 算子注册表 +# ============================================================================== + +# 所有 unary 算子的测试类映射 +UNARY_OP_TESTS = { + "abs": AbsTest, + "acos": AcosTest, + "acosh": AcoshTest, + "asin": AsinTest, + "asinh": AsinhTest, + "atan": AtanTest, + "atanh": AtanhTest, + "ceil": CeilTest, + "cos": CosTest, + "cosh": CoshTest, + "erf": ErfTest, + "floor": FloorTest, + "log": LogTest, + "neg": NegTest, + "reciprocal": ReciprocalTest, + "round": RoundTest, + "sign": SignTest, + "sinh": SinhTest, + "sqrt": SqrtTest, + "tan": TanTest, +} + + +# ============================================================================== +# 主函数 +# ============================================================================== + +def main(): + # 先获取基础参数解析器 + from libinfiniop.utils import get_args as get_base_args + import sys + + # 创建新的参数解析器,添加 --ops 参数 + parser = argparse.ArgumentParser(description="Test all unary operators", parents=[]) + parser.add_argument( + "--ops", + nargs="+", + choices=list(UNARY_OP_TESTS.keys()), + default=list(UNARY_OP_TESTS.keys()), + help="Specify which operators to test (default: all)", + ) + + # 解析参数 + args, unknown = parser.parse_known_args() + + # 将未知参数传递给基础参数解析器 + if unknown: + sys.argv = [sys.argv[0]] + unknown + base_args = get_base_args() + else: + # 如果没有其他参数,使用默认值 + sys.argv = [sys.argv[0]] + base_args = get_base_args() + + # 合并参数 + for attr in dir(base_args): + if not attr.startswith("_") and not hasattr(args, attr): + setattr(args, attr, getattr(base_args, attr)) + + # 运行选定的算子测试 + print(f"\n{'='*60}") + print(f"Testing {len(args.ops)} unary operator(s): {', '.join(args.ops)}") + print(f"{'='*60}\n") + + failed_ops = [] + passed_ops = [] + + for op_name in args.ops: + test_class = UNARY_OP_TESTS[op_name] + print(f"\n{'='*60}") + print(f"Testing {test_class.OP_NAME} operator") + print(f"{'='*60}") + + try: + # 创建临时参数对象,传递给测试类 + test_class.DEBUG = args.debug + test_class.PROFILE = args.profile + test_class.NUM_PRERUN = args.num_prerun + test_class.NUM_ITERATIONS = args.num_iterations + + # 运行测试 + for device in get_test_devices(args): + test_operator(device, test_class.test, test_class.TEST_CASES, test_class.TENSOR_DTYPES) + + print(f"\033[92m{test_class.OP_NAME} test passed!\033[0m") + passed_ops.append(op_name) + except Exception as e: + print(f"\033[91m{test_class.OP_NAME} test failed: {e}\033[0m") + failed_ops.append(op_name) + if args.debug: + import traceback + traceback.print_exc() + + # 打印总结 + print(f"\n{'='*60}") + print("Test Summary") + print(f"{'='*60}") + print(f"Total operators: {len(args.ops)}") + print(f"\033[92mPassed: {len(passed_ops)} - {', '.join(passed_ops)}\033[0m") + if failed_ops: + print(f"\033[91mFailed: {len(failed_ops)} - {', '.join(failed_ops)}\033[0m") + print(f"{'='*60}\n") + + if failed_ops: + exit(1) + + +if __name__ == "__main__": + from libinfiniop.utils import get_test_devices, test_operator + main() From dcea337976d62fe93ba415040dddc576f7d4d1b4 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Thu, 8 Jan 2026 06:21:12 +0000 Subject: [PATCH 5/7] Issue/888 - Add 
averagepool,batch_norm,cross_entropy_loss,exp,hardswish,gather,index_copy_inplace,interpolate_nearest,maxpool,scatter operators from competition. --- include/infiniop/ops/averagepool.h | 29 ++ include/infiniop/ops/batch_norm.h | 37 ++ include/infiniop/ops/cross_entropy_loss.h | 27 ++ include/infiniop/ops/exp.h | 24 ++ include/infiniop/ops/gather.h | 31 ++ include/infiniop/ops/hardswish.h | 24 ++ include/infiniop/ops/index_copy_inplace.h | 30 ++ include/infiniop/ops/interpolate_nearest.h | 25 ++ include/infiniop/ops/maxpool.h | 29 ++ include/infiniop/ops/scatter.h | 30 ++ src/infiniop/ops/averagepool/averagepool.h | 52 +++ .../ops/averagepool/cpu/averagepool_cpu.cc | 348 ++++++++++++++++++ .../ops/averagepool/cpu/averagepool_cpu.h | 8 + .../averagepool/cuda/averagepool_kernel.cuh | 185 ++++++++++ src/infiniop/ops/averagepool/info.h | 136 +++++++ .../ops/averagepool/nvidia/averagepool.cu | 220 +++++++++++ .../averagepool/nvidia/averagepool_nvidia.cuh | 8 + src/infiniop/ops/averagepool/operator.cc | 147 ++++++++ src/infiniop/ops/batch_norm/batch_norm.h | 54 +++ .../ops/batch_norm/cpu/batch_norm_cpu.cc | 118 ++++++ .../ops/batch_norm/cpu/batch_norm_cpu.h | 8 + src/infiniop/ops/batch_norm/cuda/kernel.cuh | 57 +++ src/infiniop/ops/batch_norm/info.h | 69 ++++ .../batch_norm/nvidia/batch_norm_nvidia.cu | 176 +++++++++ .../batch_norm/nvidia/batch_norm_nvidia.cuh | 10 + src/infiniop/ops/batch_norm/operator.cc | 156 ++++++++ .../cpu/cross_entropy_loss_cpu.cc | 321 ++++++++++++++++ .../cpu/cross_entropy_loss_cpu.h | 8 + .../cross_entropy_loss/cross_entropy_loss.h | 48 +++ src/infiniop/ops/cross_entropy_loss/info.h | 36 ++ .../nvidia/cross_entropy_loss_nvidia.cu | 217 +++++++++++ .../nvidia/cross_entropy_loss_nvidia.cuh | 8 + .../ops/cross_entropy_loss/operator.cc | 142 +++++++ src/infiniop/ops/exp/cpu/exp_cpu.cc | 52 +++ src/infiniop/ops/exp/cpu/exp_cpu.h | 21 ++ src/infiniop/ops/exp/cuda/kernel.cuh | 39 ++ src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 59 +++ src/infiniop/ops/exp/nvidia/exp_nvidia.cuh | 8 + src/infiniop/ops/exp/operator.cc | 139 +++++++ src/infiniop/ops/gather/cpu/gather_cpu.cc | 96 +++++ src/infiniop/ops/gather/cpu/gather_cpu.h | 8 + src/infiniop/ops/gather/cuda/kernel.cuh | 37 ++ src/infiniop/ops/gather/gather.h | 47 +++ src/infiniop/ops/gather/info.h | 58 +++ .../ops/gather/nvidia/gather_nvidia.cu | 179 +++++++++ .../ops/gather/nvidia/gather_nvidia.cuh | 7 + src/infiniop/ops/gather/operator.cc | 144 ++++++++ .../ops/hardswish/cpu/hardswish_cpu.cc | 52 +++ .../ops/hardswish/cpu/hardswish_cpu.h | 30 ++ src/infiniop/ops/hardswish/cuda/kernel.cuh | 55 +++ .../ops/hardswish/nvidia/hardswish_nvidia.cu | 59 +++ .../ops/hardswish/nvidia/hardswish_nvidia.cuh | 8 + src/infiniop/ops/hardswish/operator.cc | 139 +++++++ .../cpu/index_copy_inplace_cpu.cc | 93 +++++ .../cpu/index_copy_inplace_cpu.h | 8 + .../index_copy_inplace/index_copy_inplace.h | 53 +++ src/infiniop/ops/index_copy_inplace/info.h | 74 ++++ .../nvidia/index_copy_inplace_nvidia.cu | 127 +++++++ .../nvidia/index_copy_inplace_nvidia.cuh | 7 + .../ops/index_copy_inplace/operator.cc | 144 ++++++++ .../cpu/interpolate_nearest_cpu.cc | 284 ++++++++++++++ .../cpu/interpolate_nearest_cpu.h | 8 + .../ops/interpolate_nearest/cuda/kernel.cuh | 168 +++++++++ src/infiniop/ops/interpolate_nearest/info.h | 118 ++++++ .../interpolate_nearest/interpolate_nearest.h | 51 +++ .../nvidia/interpolate_nearest_nvidia.cu | 93 +++++ .../nvidia/interpolate_nearest_nvidia.cuh | 9 + .../ops/interpolate_nearest/operator.cc | 139 +++++++ 
src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc | 322 ++++++++++++++++ src/infiniop/ops/maxpool/cpu/maxpool_cpu.h | 8 + src/infiniop/ops/maxpool/info.h | 113 ++++++ src/infiniop/ops/maxpool/maxpool.h | 53 +++ .../ops/maxpool/nvidia/maxpool_nvidia.cu | 240 ++++++++++++ .../ops/maxpool/nvidia/maxpool_nvidia.cuh | 8 + src/infiniop/ops/maxpool/operator.cc | 147 ++++++++ src/infiniop/ops/scatter/cpu/scatter_cpu.cc | 100 +++++ src/infiniop/ops/scatter/cpu/scatter_cpu.h | 8 + src/infiniop/ops/scatter/cuda/kernel.cuh | 37 ++ src/infiniop/ops/scatter/info.h | 64 ++++ .../ops/scatter/metax/scatter_metax.h | 8 + .../ops/scatter/metax/scatter_metax.maca | 190 ++++++++++ .../ops/scatter/nvidia/scatter_nvidia.cu | 180 +++++++++ .../ops/scatter/nvidia/scatter_nvidia.cuh | 7 + src/infiniop/ops/scatter/operator.cc | 160 ++++++++ src/infiniop/ops/scatter/scatter.h | 47 +++ test/infiniop/averagepool.py | 239 ++++++++++++ test/infiniop/batch_norm.py | 244 ++++++++++++ test/infiniop/cross_entropy_loss.py | 213 +++++++++++ test/infiniop/exp.py | 165 +++++++++ test/infiniop/gather.py | 160 ++++++++ test/infiniop/hardswish.py | 167 +++++++++ test/infiniop/index_copy_inplace.py | 180 +++++++++ test/infiniop/interpolate_nearest.py | 265 +++++++++++++ test/infiniop/libinfiniop/op_register.py | 321 +++++++++++++++- test/infiniop/maxpool.py | 242 ++++++++++++ test/infiniop/scatter.py | 196 ++++++++++ 96 files changed, 9514 insertions(+), 1 deletion(-) create mode 100644 include/infiniop/ops/averagepool.h create mode 100644 include/infiniop/ops/batch_norm.h create mode 100644 include/infiniop/ops/cross_entropy_loss.h create mode 100644 include/infiniop/ops/exp.h create mode 100644 include/infiniop/ops/gather.h create mode 100644 include/infiniop/ops/hardswish.h create mode 100644 include/infiniop/ops/index_copy_inplace.h create mode 100644 include/infiniop/ops/interpolate_nearest.h create mode 100644 include/infiniop/ops/maxpool.h create mode 100644 include/infiniop/ops/scatter.h create mode 100644 src/infiniop/ops/averagepool/averagepool.h create mode 100644 src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc create mode 100644 src/infiniop/ops/averagepool/cpu/averagepool_cpu.h create mode 100644 src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh create mode 100644 src/infiniop/ops/averagepool/info.h create mode 100644 src/infiniop/ops/averagepool/nvidia/averagepool.cu create mode 100644 src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh create mode 100644 src/infiniop/ops/averagepool/operator.cc create mode 100644 src/infiniop/ops/batch_norm/batch_norm.h create mode 100644 src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc create mode 100644 src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h create mode 100644 src/infiniop/ops/batch_norm/cuda/kernel.cuh create mode 100644 src/infiniop/ops/batch_norm/info.h create mode 100644 src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu create mode 100644 src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh create mode 100644 src/infiniop/ops/batch_norm/operator.cc create mode 100644 src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc create mode 100644 src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h create mode 100644 src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h create mode 100644 src/infiniop/ops/cross_entropy_loss/info.h create mode 100644 src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu create mode 100644 src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh create mode 
100644 src/infiniop/ops/cross_entropy_loss/operator.cc create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.cc create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.h create mode 100644 src/infiniop/ops/exp/cuda/kernel.cuh create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cu create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cuh create mode 100644 src/infiniop/ops/exp/operator.cc create mode 100644 src/infiniop/ops/gather/cpu/gather_cpu.cc create mode 100644 src/infiniop/ops/gather/cpu/gather_cpu.h create mode 100644 src/infiniop/ops/gather/cuda/kernel.cuh create mode 100644 src/infiniop/ops/gather/gather.h create mode 100644 src/infiniop/ops/gather/info.h create mode 100644 src/infiniop/ops/gather/nvidia/gather_nvidia.cu create mode 100644 src/infiniop/ops/gather/nvidia/gather_nvidia.cuh create mode 100644 src/infiniop/ops/gather/operator.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.h create mode 100644 src/infiniop/ops/hardswish/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh create mode 100644 src/infiniop/ops/hardswish/operator.cc create mode 100644 src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc create mode 100644 src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h create mode 100644 src/infiniop/ops/index_copy_inplace/index_copy_inplace.h create mode 100644 src/infiniop/ops/index_copy_inplace/info.h create mode 100644 src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu create mode 100644 src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh create mode 100644 src/infiniop/ops/index_copy_inplace/operator.cc create mode 100644 src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc create mode 100644 src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h create mode 100644 src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh create mode 100644 src/infiniop/ops/interpolate_nearest/info.h create mode 100644 src/infiniop/ops/interpolate_nearest/interpolate_nearest.h create mode 100644 src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu create mode 100644 src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh create mode 100644 src/infiniop/ops/interpolate_nearest/operator.cc create mode 100644 src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc create mode 100644 src/infiniop/ops/maxpool/cpu/maxpool_cpu.h create mode 100644 src/infiniop/ops/maxpool/info.h create mode 100644 src/infiniop/ops/maxpool/maxpool.h create mode 100644 src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu create mode 100644 src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh create mode 100644 src/infiniop/ops/maxpool/operator.cc create mode 100644 src/infiniop/ops/scatter/cpu/scatter_cpu.cc create mode 100644 src/infiniop/ops/scatter/cpu/scatter_cpu.h create mode 100644 src/infiniop/ops/scatter/cuda/kernel.cuh create mode 100644 src/infiniop/ops/scatter/info.h create mode 100644 src/infiniop/ops/scatter/metax/scatter_metax.h create mode 100644 src/infiniop/ops/scatter/metax/scatter_metax.maca create mode 100644 src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu create mode 100644 src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh create mode 100644 src/infiniop/ops/scatter/operator.cc create mode 100644 src/infiniop/ops/scatter/scatter.h create mode 100644 
test/infiniop/averagepool.py create mode 100644 test/infiniop/batch_norm.py create mode 100644 test/infiniop/cross_entropy_loss.py create mode 100644 test/infiniop/exp.py create mode 100644 test/infiniop/gather.py create mode 100644 test/infiniop/hardswish.py create mode 100644 test/infiniop/index_copy_inplace.py create mode 100644 test/infiniop/interpolate_nearest.py create mode 100644 test/infiniop/maxpool.py create mode 100644 test/infiniop/scatter.py diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h new file mode 100644 index 000000000..87e857175 --- /dev/null +++ b/include/infiniop/ops/averagepool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_AVERAGEPOOL_H__ +#define __INFINIOP_AVERAGEPOOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); + +#endif // __INFINIOP_AVERAGEPOOL_H__ diff --git a/include/infiniop/ops/batch_norm.h b/include/infiniop/ops/batch_norm.h new file mode 100644 index 000000000..5487a1f69 --- /dev/null +++ b/include/infiniop/ops/batch_norm.h @@ -0,0 +1,37 @@ +#ifndef __INFINIOP_BATCH_NORM_API_H__ +#define __INFINIOP_BATCH_NORM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBatchNormDescriptor_t; + +__C __export infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +); + +__C __export infiniStatus_t infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopBatchNorm(infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h new file mode 100644 index 000000000..8b59843c9 --- /dev/null +++ b/include/infiniop/ops/cross_entropy_loss.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ +#define __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t; + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc); + +__C 
infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream); + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); + +#endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h new file mode 100644 index 000000000..624bc5363 --- /dev/null +++ b/include/infiniop/ops/exp.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_EXP_API_H__ +#define __INFINIOP_EXP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; + +__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gather.h b/include/infiniop/ops/gather.h new file mode 100644 index 000000000..9ffe310c9 --- /dev/null +++ b/include/infiniop/ops/gather.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_GATHER_API_H__ +#define __INFINIOP_GATHER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..8d655fe82 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/index_copy_inplace.h 
b/include/infiniop/ops/index_copy_inplace.h new file mode 100644 index 000000000..e2266299a --- /dev/null +++ b/include/infiniop/ops/index_copy_inplace.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +__C __export infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopIndexCopyInplace(infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h new file mode 100644 index 000000000..7f970dc38 --- /dev/null +++ b/include/infiniop/ops/interpolate_nearest.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_INTERPOLATE_NEAREST_H__ +#define __INFINIOP_INTERPOLATE_NEAREST_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t; + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc); + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); + +#endif // __INFINIOP_INTERPOLATE_NEAREST_H__ diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h new file mode 100644 index 000000000..e47a43aed --- /dev/null +++ b/include/infiniop/ops/maxpool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_MAX_POOL_H__ +#define __INFINIOP_MAX_POOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); + +#endif // __INFINIOP_MAX_POOL_H__ diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h new file mode 100644 index 000000000..22e0eff83 --- /dev/null +++ b/include/infiniop/ops/scatter.h @@ -0,0 +1,30 @@ +#ifndef 
__INFINIOP_SCATTER_API_H__ +#define __INFINIOP_SCATTER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; + +__C __export infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/averagepool/averagepool.h b/src/infiniop/ops/averagepool/averagepool.h new file mode 100644 index 000000000..7762826ab --- /dev/null +++ b/src/infiniop/ops/averagepool/averagepool.h @@ -0,0 +1,52 @@ +#ifndef __AVERAGEPOOL_H__ +#define __AVERAGEPOOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::averagepool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_H__ diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc new file mode 100644 index 000000000..95a347ddc --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc @@ -0,0 +1,348 @@ +#include "averagepool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + workspace_size = 0; + } + + template + void _avgpool_1d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + +#pragma omp parallel for 
schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + const size_t input_offset = (static_cast(b) * channels + static_cast(c)) * input_nc_stride; + const size_t output_offset = (static_cast(b) * channels + static_cast(c)) * output_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + iw]); + valid_count++; + } else if (iw >= -static_cast(pad_width) && iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + ow] = utils::cast(result); + } + } + } + } + + template + void _avgpool_2d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + const size_t input_offset = (static_cast(b) * channels + static_cast(c)) * input_nc_stride; + const size_t output_offset = (static_cast(b) * channels + static_cast(c)) * output_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + ih * input_width + iw]); + valid_count++; + } else if (ih >= -static_cast(pad_height) && ih < static_cast(input_height + pad_height) && iw >= -static_cast(pad_width) && iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + oh * output_width + ow] = utils::cast(result); + } + } + } + } + } + + template + void _avgpool_3d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = 
info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + const size_t input_offset = (static_cast(b) * channels + static_cast(c)) * input_nc_stride; + const size_t output_offset = (static_cast(b) * channels + static_cast(c)) * output_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kd = 0; kd < static_cast(kernel_depth); ++kd) { + const int id = start_d + kd; + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + const int ih = start_h + kh; + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && ih >= 0 && ih < static_cast(input_height) && iw >= 0 && iw < static_cast(input_width)) { + const size_t idx = id * (input_height * input_width) + ih * input_width + iw; + sum += utils::cast(input[input_offset + idx]); + valid_count++; + } else if (id >= -static_cast(pad_depth) && id < static_cast(input_depth + pad_depth) && ih >= -static_cast(pad_height) && ih < static_cast(input_height + pad_height) && iw >= -static_cast(pad_width) && iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + + const size_t out_idx = od * (output_height * output_width) + oh * output_width + ow; + output[output_offset + out_idx] = utils::cast(result); + } + } + } + } + } + } + + template + void _avgpool_cpu(Ydata *output, const T *input) const { + switch (info.ndim) { + case 1: + _avgpool_1d(output, input); + break; + case 2: + _avgpool_2d(output, input); + break; + case 3: + _avgpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + AvgPoolInfo &info) { + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + output_size *= info.output_dims[i]; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + 
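            // Sketch of the dtype dispatch (not in the original comments): the F32 case runs the
            // pooling loops directly into the caller's output buffer, while the F16/BF16 cases
            // below first accumulate into the float workspace and then cast each element back to
            // the narrow type — which is why create() only requests a workspace for half-precision
            // dtypes.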
_avgpool_cpu(typed_output, typed_input); + break; + } + case INFINI_DTYPE_F16: { + float *typed_output_f32 = static_cast(workspace); + const fp16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + fp16_t *typed_output = static_cast(output); +#pragma omp parallel for + for (int i = 0; i < static_cast(output_size); ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + float *typed_output_f32 = static_cast(workspace); + const bf16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + bf16_t *typed_output = static_cast(output); +#pragma omp parallel for + for (int i = 0; i < static_cast(output_size); ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +inline size_t calculateOutputSize(const AvgPoolInfo &info) { + size_t size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + size *= info.output_dims[i]; + } + return size; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + size_t workspace_size = 0; + if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) { + workspace_size = calculateOutputSize(info) * sizeof(float); + } + + *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::averagepool::cpu diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h new file mode 100644 index 000000000..8388f80ff --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CPU_H__ +#define __AVERAGEPOOL_CPU_H__ + +#include "../averagepool.h" + +DESCRIPTOR(cpu) + +#endif // __AVERAGEPOOL_CPU_H__ diff --git a/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh new file mode 100644 index 000000000..7c9d0f438 --- /dev/null +++ b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh @@ -0,0 +1,185 @@ +#ifndef __AVERAGEPOOL_KERNEL_H__ +#define __AVERAGEPOOL_KERNEL_H__ + +#include + +// 1D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool1d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_length, + int output_length, int kernel_size, int stride, int 
padding) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) { + return; + } + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_length + channel_idx * input_length; + T *output_ptr = output + batch_idx * channels * output_length + channel_idx * output_length; + + // 计算池化窗口的起始位置 + int window_start = output_idx * stride - padding; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int k = 0; k < kernel_size; ++k) { + int input_pos = window_start + k; + + if (input_pos >= 0 && input_pos < input_length) { + // 有效的输入位置,转换为单精度进行累加 + sum += static_cast(input_ptr[input_pos]); + valid_count++; + } else if (input_pos >= -padding && input_pos < input_length + padding) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 2D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool2d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_height, + int input_width, int output_height, int output_width, int kernel_h, + int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为2D坐标 + int out_h = output_idx / output_width; + int out_w = output_idx % output_width; + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width; + T *output_ptr = output + batch_idx * channels * output_height * output_width + channel_idx * output_height * output_width; + + // 计算池化窗口的起始位置 + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = input_h * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 3D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool3d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_depth, + int input_height, int input_width, int output_depth, int output_height, + int output_width, int kernel_d, int kernel_h, int kernel_w, int stride_d, + int stride_h, int stride_w, int pad_d, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = 
blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_depth * output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为3D坐标 + int out_d = output_idx / (output_height * output_width); + int remaining = output_idx % (output_height * output_width); + int out_h = remaining / output_width; + int out_w = remaining % output_width; + + // 计算输入和输出的偏移 + int input_spatial_size = input_depth * input_height * input_width; + int output_spatial_size = output_depth * output_height * output_width; + + const T *input_ptr = input + batch_idx * channels * input_spatial_size + channel_idx * input_spatial_size; + T *output_ptr = output + batch_idx * channels * output_spatial_size + channel_idx * output_spatial_size; + + // 计算池化窗口的起始位置 + int window_start_d = out_d * stride_d - pad_d; + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kd = 0; kd < kernel_d; ++kd) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_d = window_start_d + kd; + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = (input_d * input_height + input_h) * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +#endif // __AVERAGEPOOL_KERNEL_H__ diff --git a/src/infiniop/ops/averagepool/info.h b/src/infiniop/ops/averagepool/info.h new file mode 100644 index 000000000..871e827a7 --- /dev/null +++ b/src/infiniop/ops/averagepool/info.h @@ -0,0 +1,136 @@ +#ifndef __AVERAGEPOOL_INFO_H__ +#define __AVERAGEPOOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include +#include + +namespace op::averagepool { + +inline utils::Result calculatePoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + size_t padded_input_size = input_size + 2 * padding; + + if (padded_input_size < kernel_size) { + return utils::Result(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t output_size; + if (ceil_mode) { + // 等效于整数的上取整 + output_size = (padded_input_size - kernel_size + stride - 1) / stride + 1; + } else { + // 等效于整数的下取整 + output_size = (padded_input_size - kernel_size) / stride + 1; + } + + return utils::Result(output_size); +} + +// 检查是否存在隐式填充 +inline bool hasImplicitPadding( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding, + bool ceil_mode) { + + if (!ceil_mode) { + return false; + } + return ((input_size + 2 * padding) - kernel_size) % stride != 0; +} + +class 
AvgPoolInfo { + AvgPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + bool has_implicit_padding = false; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + AvgPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // 空间维度 + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // 初始化隐式填充标志 + info.has_implicit_padding = false; + + // 获取并校验空间维度 + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + + auto output_size_result = calculatePoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size_result); + + size_t expected_size = output_size_result.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + + // 检查当前维度是否存在隐式填充 + if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i], + info.strides[i], info.pads[i], info.ceil_mode)) { + info.has_implicit_padding = true; + } + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::averagepool + +#endif // __AVERAGEPOOL_INFO_H__ diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool.cu b/src/infiniop/ops/averagepool/nvidia/averagepool.cu new file mode 100644 index 000000000..6f276aac8 --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool.cu @@ -0,0 +1,220 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "averagepool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = 
device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, kernel_vec.size(), kernel_vec.data(), + pad_vec.data(), stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Average pooling typically doesn't need a workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + 
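// A minimal host-side usage sketch of the AvgPool C API that operator.cc wires up below.
// It assumes an already-created infiniopHandle_t, valid NCHW input/output tensor
// descriptors, and device buffers; the element type of the kernel_size/strides/pads
// arrays (passed as void*) is an assumption here, chosen to match how AvgPoolInfo::create
// reads them. Error-status checks are omitted for brevity.
//
//   size_t ks[2] = {3, 3}, st[2] = {2, 2}, pd[2] = {1, 1};   // hypothetical 2D pooling config
//   infiniopAvgPoolDescriptor_t pool_desc = nullptr;
//   infiniopCreateAvgPoolDescriptor(handle, &pool_desc, out_desc, in_desc,
//                                   ks, st, pd, /*ceil_mode=*/false);
//   size_t ws_size = 0;
//   infiniopGetAvgPoolWorkspaceSize(pool_desc, &ws_size);
//   infiniopAvgPool(pool_desc, workspace, ws_size, d_output, d_input, stream);
//   infiniopDestroyAvgPoolDescriptor(pool_desc);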
+infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::nvidia diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh new file mode 100644 index 000000000..ef19aa1dc --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CUDA_CUH__ +#define __AVERAGEPOOL_CUDA_CUH__ + +#include "../averagepool.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_CUDA_CUH__ diff --git a/src/infiniop/ops/averagepool/operator.cc b/src/infiniop/ops/averagepool/operator.cc new file mode 100644 index 000000000..233ec4736 --- /dev/null +++ b/src/infiniop/ops/averagepool/operator.cc @@ -0,0 +1,147 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/averagepool_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::averagepool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t 
*size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAvgPool( + infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/batch_norm/batch_norm.h b/src/infiniop/ops/batch_norm/batch_norm.h new file mode 100644 index 000000000..3bee6b5bb --- /dev/null +++ b/src/infiniop/ops/batch_norm/batch_norm.h @@ -0,0 +1,54 @@ +#ifndef __BATCH_NORM_H__ +#define __BATCH_NORM_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::batch_norm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BatchNormInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + BatchNormInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t running_mean_desc, \ + infiniopTensorDescriptor_t running_var_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + float momentum, \ + float eps); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + void *running_mean, \ + void *running_var, \ + const void *input, \ + const void *weight, \ + const void *bias, \ + void 
*stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc new file mode 100644 index 000000000..876b82904 --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc @@ -0,0 +1,118 @@ +#include "batch_norm_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::batch_norm::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + size_t WorkSpaceSize = 0; + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias) { + +#pragma omp parallel for + for (int c = 0; c < static_cast(info.channel_size); c++) { + float sum_sq = 0., sum = 0.; + for (size_t b = 0; b < info.batch_size; b++) { + sum += op::common_cpu::reduce_op::sum( + input + (b * info.channel_size + static_cast(c)) * info.dim_size, + info.dim_size, + 1); + sum_sq += op::common_cpu::reduce_op::sumSquared( + input + (b * info.channel_size + static_cast(c)) * info.dim_size, + info.dim_size, + 1); + } + float batch_and_dim_size = static_cast(info.batch_size * info.dim_size); + float E = sum / batch_and_dim_size; + float var_biased = sum_sq / batch_and_dim_size - E * E; + float var_unbiased = var_biased * batch_and_dim_size / (batch_and_dim_size - 1.0f); + + auto running_mean_ptr = running_mean + static_cast(c) * info.running_mean_stride; + auto running_var_ptr = running_var + static_cast(c) * info.running_var_stride; + *running_mean_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_mean_ptr) + info.momentum * E); + *running_var_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_var_ptr) + info.momentum * var_unbiased); + + for (size_t b = 0; b < info.batch_size; b++) { + for (size_t d = 0; d < info.dim_size; d++) { + auto input_ptr = input + ((b * info.channel_size + static_cast(c)) * info.dim_size) + d; + auto output_ptr = output + ((b * info.channel_size + static_cast(c)) * info.dim_size) + d; + ; + auto weight_ptr = weight + static_cast(c) * info.weight_stride; + auto bias_ptr = bias + static_cast(c) * info.bias_stride; + *output_ptr = utils::cast( + (utils::cast(*input_ptr) - E) / std::sqrt(var_biased + info.eps) * utils::cast(*weight_ptr) + utils::cast(*bias_ptr)); + } + } + } + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BATCH_NORM(TDATA) \ + CHECK_STATUS(calculate_batch_norm(_info, \ + (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA 
*)input, (const TDATA *)weight, (const TDATA *)bias)) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + void *running_mean, + void *running_var, + const void *input, + const void *weight, + const void *bias, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_BATCH_NORM(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_BATCH_NORM(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_BATCH_NORM(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::cpu diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h new file mode 100644 index 000000000..722ebc6ec --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h @@ -0,0 +1,8 @@ +#ifndef __BATCH_NORM_CPU_H__ +#define __BATCH_NORM_CPU_H__ + +#include "../batch_norm.h" + +DESCRIPTOR(cpu) + +#endif // __BATCH_NORM_CPU_H__ diff --git a/src/infiniop/ops/batch_norm/cuda/kernel.cuh b/src/infiniop/ops/batch_norm/cuda/kernel.cuh new file mode 100644 index 000000000..6132b1afe --- /dev/null +++ b/src/infiniop/ops/batch_norm/cuda/kernel.cuh @@ -0,0 +1,57 @@ +#ifndef __BATCH_NORM_KERNEL_CUH__ +#define __BATCH_NORM_KERNEL_CUH__ + +#include "../../../reduce/cuda/reduce.cuh" +#include + +template +__device__ void batchNormKernel( + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + float momentum, + float eps) { + auto output_ptr = output + dim_size * blockIdx.x; + auto input_ptr = input + dim_size * blockIdx.x; + + auto running_mean_ptr = running_mean + running_mean_stride * blockIdx.x; + auto running_var_ptr = running_var + running_var_stride * blockIdx.x; + auto weight_ptr = weight + weight_stride * blockIdx.x; + auto bias_ptr = bias + bias_stride * blockIdx.x; + + Tcompute sum_squared = 0., sum = 0.; + for (size_t b = 0; b < batch_size; b++) { + sum += op::common_cuda::reduce_op::sum( + input_ptr + b * (channel_size * dim_size), dim_size); + sum_squared += op::common_cuda::reduce_op::sumSquared( + input_ptr + b * (channel_size * dim_size), dim_size); + } + + __shared__ Tcompute E, var_biased; + if (threadIdx.x == 0) { + E = sum / Tcompute(batch_size * dim_size); + var_biased = sum_squared / Tcompute(batch_size * dim_size) - E * E; + Tcompute var_unbiased = var_biased * Tcompute(batch_size * dim_size) / Tcompute(batch_size * dim_size - 1); + *running_mean_ptr = Tcompute(1 - momentum) * Tcompute(*running_mean_ptr) + Tcompute(momentum) * E; + *running_var_ptr = Tcompute(1 - momentum) * Tcompute(*running_var_ptr) + Tcompute(momentum) * var_unbiased; + } + __syncthreads(); + + for (size_t n = threadIdx.x; n < batch_size * dim_size; n += BLOCK_SIZE) { + size_t b = n / dim_size, d = n % dim_size; + *(output_ptr + b * channel_size * dim_size + d) = (Tcompute(*(input_ptr + b * channel_size * dim_size + d)) - E) / sqrtf(float(var_biased + Tcompute(eps))) * Tcompute(*weight_ptr) + Tcompute(*bias_ptr); + } +} + +#endif // __BATCH_NORM_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/info.h b/src/infiniop/ops/batch_norm/info.h new file mode 100644 index 000000000..c27479865 --- /dev/null +++ 
b/src/infiniop/ops/batch_norm/info.h @@ -0,0 +1,69 @@ +#ifndef __BATCH_NORM_INFO_H__ +#define __BATCH_NORM_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::batch_norm { + +class BatchNormInfo { +private: + BatchNormInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t batch_size, channel_size, dim_size; + + ptrdiff_t running_mean_stride; + ptrdiff_t running_var_stride; + ptrdiff_t weight_stride; + ptrdiff_t bias_stride; + float momentum; + float eps; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createBatchNormInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == 3, + INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_SAME_SHAPE(output_desc->shape(), input_desc->shape()); + size_t batch_size = output_desc->dim(0), + channel_size = output_desc->dim(1), + dim_size = output_desc->dim(2); + CHECK_SAME_SHAPE( + running_mean_desc->shape(), running_var_desc->shape(), + weight_desc->shape(), bias_desc->shape()); + CHECK_OR_RETURN( + running_mean_desc->ndim() == 1 && running_mean_desc->dim(0) == channel_size, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(BatchNormInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + batch_size, channel_size, dim_size, + running_mean_desc->stride(0), + running_var_desc->stride(0), + weight_desc->stride(0), + bias_desc->stride(0), + momentum, + eps + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::batch_norm + +#endif // __BATCH_NORM_INFO_H__ diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu new file mode 100644 index 000000000..e5e132c89 --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu @@ -0,0 +1,176 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "batch_norm_nvidia.cuh" + +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include + +#include "../info.h" + +namespace op::batch_norm::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + + float momentum, + float eps) { + + batchNormKernel( + output, + running_mean, + running_var, + input, + weight, + bias, + + batch_size, + channel_size, + 
dim_size, + + running_mean_stride, + running_var_stride, + weight_stride, + bias_stride, + + momentum, + eps); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias, + + cudaStream_t stream) { + launchKernel<<>>( + output, + running_mean, + running_var, + input, + weight, + bias, + + info.batch_size, + info.channel_size, + info.dim_size, + + info.running_mean_stride, + info.running_var_stride, + info.weight_stride, + info.bias_stride, + info.momentum, + info.eps); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + void *running_mean, + void *running_var, + const void *input, + const void *weight, + const void *bias, + void *stream_) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + +#define CALCULATE_BATCH_NORM(BLOCK_SIZE, TDATA) \ + calculate_batch_norm(_info, (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream) +#define CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + 
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::nvidia diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh new file mode 100644 index 000000000..33c93f2b4 --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh @@ -0,0 +1,10 @@ +#ifndef __BATCH_NORM_NVIDIA_API_H__ +#define __BATCH_NORM_NVIDIA_API_H__ + +// #ifdef ENABLE_NINETOOTHED +#include "../batch_norm.h" +DESCRIPTOR(nvidia) + +// #endif + +#endif // __BATCH_NORM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/batch_norm/operator.cc b/src/infiniop/ops/batch_norm/operator.cc new file mode 100644 index 000000000..a87dfff60 --- /dev/null +++ b/src/infiniop/ops/batch_norm/operator.cc @@ -0,0 +1,156 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/batch_norm.h" + +#ifdef ENABLE_CPU_API +#include "cpu/batch_norm_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/batch_norm_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::batch_norm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + running_mean_desc, \ + running_var_desc, \ + input_desc, \ + weight_desc, \ + bias_desc, \ + momentum, \ + eps) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopBatchNorm( + infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + void *running_mean, + void *running_var, + const void *input, + const void *weight, + const void *bias, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, running_mean, running_var, \ + input, weight, bias, stream) + + switch 
(desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc new file mode 100644 index 000000000..af97c1d09 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc @@ -0,0 +1,321 @@ +#include "cross_entropy_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cross_entropy_loss::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + std::vector logits_shape; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const std::vector &shape) + : handle(handle_ptr), logits_shape(shape) { + // 计算workspace大小:需要存储per-sample loss + size_t N = logits_shape[0]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + workspace_size = N * inner_size * sizeof(float); + } + + void cross_entropy_f16_as_float(float *workspace, float *loss_result, + const fp16_t *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + // 转换F16 logits为float + size_t total_logits_size = N * C * inner_size; + std::vector float_logits(total_logits_size); + for (size_t i = 0; i < total_logits_size; ++i) { + float_logits[i] = utils::cast(logits[i]); + } + + // 使用float精度计算 + cross_entropy_cpu_float(workspace, loss_result, float_logits.data(), target); + } + + // 通用的float版本交叉熵计算 + void cross_entropy_cpu_float(float *workspace, float *loss_result, + const float *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = 
-std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + max_logit = std::max(max_logit, logits[logit_idx]); + } + + // 计算exp的和(减去最大值保证数值稳定) + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + sum_exp += std::exp(logits[logit_idx] - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit = logits[target_logit_idx]; + + // 计算交叉熵损失:log_softmax[target] = logit[target] - log(sum_exp) - max_logit + // 所以 -log_softmax[target] = log(sum_exp) + max_logit - logit[target] + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失(忽略ignore_index的样本) + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + *loss_result = valid_count > 0 ? static_cast(total_loss / valid_count) : 0.0f; + } + + // 通用模板版本(用于F32和BF16) + template + void cross_entropy_cpu_generic(float *workspace, T *loss_result, + const T *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + max_logit = std::max(max_logit, logit_val); + } + + // 计算exp的和 + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + sum_exp += std::exp(logit_val - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit; + if constexpr (std::is_same::value) { + target_logit = utils::cast(logits[target_logit_idx]); + } else { + target_logit = logits[target_logit_idx]; + } + + // 计算交叉熵损失 + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失 + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + float mean_loss = valid_count > 0 ? 
static_cast(total_loss / valid_count) : 0.0f; + + // 转换回输出类型 + if constexpr (std::is_same::value) { + *loss_result = utils::cast(mean_loss); + } else { + *loss_result = static_cast(mean_loss); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + logits_shape(std::move(other.logits_shape)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, const std::vector &shape) { + Opaque opaque(handle_ptr, shape); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, const void *target, + infiniDtype_t dtype) const { + if (!workspace || !loss || !logits || !target) { + return INFINI_STATUS_BAD_PARAM; + } + + if (workspace_size < this->workspace_size) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + float *workspace_ptr = static_cast(workspace); + const int64_t *target_ptr = static_cast(target); + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *logits_ptr = static_cast(logits); + float *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *logits_ptr = static_cast(logits); + fp16_t *loss_ptr = static_cast(loss); + + // F16特殊处理:使用float计算 + float temp_loss; + cross_entropy_f16_as_float(workspace_ptr, &temp_loss, logits_ptr, target_ptr); + *loss_ptr = utils::cast(temp_loss); + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *logits_ptr = static_cast(logits); + bf16_t *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + + size_t get_workspace_size() const { + return workspace_size; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig_shape = logits_desc->shape(); + std::vector logits_shape; + + if (orig_shape.size() == 1) { + logits_shape = {1, orig_shape[0]}; + } else { + logits_shape = orig_shape; + } + + if (logits_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque_result = Opaque::create(handle, logits_shape); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->get_workspace_size(), opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { + return _opaque->calculate(workspace, workspace_size, loss, logits, target, _dtype); +} + +} // namespace op::cross_entropy_loss::cpu diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h new file mode 100644 index 000000000..8afec63d0 --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CPU_H__ +#define __CROSS_ENTROPY_LOSS_CPU_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(cpu) + +#endif // __CROSS_ENTROPY_LOSS_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h new file mode 100644 index 000000000..dad108d78 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h @@ -0,0 +1,48 @@ +#ifndef __CROSS_ENTROPY_LOSS_H__ +#define __CROSS_ENTROPY_LOSS_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cross_entropy_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t loss_desc, \ + infiniopTensorDescriptor_t logits_desc, \ + infiniopTensorDescriptor_t target_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *loss, \ + const void *logits, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __CROSS_ENTROPY_LOSS_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/info.h b/src/infiniop/ops/cross_entropy_loss/info.h new file mode 100644 index 000000000..5278bf912 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/info.h @@ -0,0 +1,36 @@ +#ifndef __CROSS_ENTROPY_LOSS_INFO_H__ +#define __CROSS_ENTROPY_LOSS_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::cross_entropy_loss { + +class CrossEntropyInfo { +public: + CrossEntropyInfo() = default; + size_t batch = 0; + size_t num_classes = 0; + infiniDtype_t dtype; + + static utils::Result create( + infiniopTensorDescriptor_t loss, + infiniopTensorDescriptor_t logits, + infiniopTensorDescriptor_t target) { + + if (logits->ndim() != 2 || loss->ndim() != 1 || target->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CrossEntropyInfo info; + info.batch = logits->dim(0); + info.num_classes = logits->dim(1); + info.dtype = logits->dtype(); + return utils::Result(std::move(info)); + } +}; + +} // namespace op::cross_entropy_loss + +#endif // __CROSS_ENTROPY_LOSS_INFO_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu new file mode 100644 index 000000000..3d795a67a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "cross_entropy_loss_nvidia.cuh" + +namespace op::cross_entropy_loss::nvidia { +namespace cuda { + +__device__ __forceinline__ float to_float(float v) { return v; } +__device__ __forceinline__ float to_float(double v) { return (float)v; } 
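+// The kernel below computes, for each (n, inner) position, the numerically stable
+// per-sample cross-entropy  loss(idx) = logsumexp_c(logits) - logits[target], with
+// logsumexp_c(x) = log(sum_c exp(x_c - m)) + m and m = max_c x_c, so exp() never
+// overflows; positions whose target equals ignore_index are written as 0.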
+__device__ __forceinline__ float to_float(half v) { return __half2float(v); }
+__device__ __forceinline__ float to_float(__nv_bfloat16 v) {
+    return __bfloat162float(v);
+}
+
+template <typename T_in, typename T_out>
+__global__ void
+softmaxCrossEntropy_per_sample(T_out *__restrict__ loss,
+                               const T_in *__restrict__ logits,
+                               const int64_t *__restrict__ target, int N, int C,
+                               long long inner_size, int64_t ignore_index) {
+    long long total = (long long)N * inner_size;
+    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total) {
+        return;
+    }
+
+    int n = (int)(idx / inner_size);
+    int inr = (int)(idx % inner_size);
+
+    int64_t t = target[(long long)n * inner_size + inr];
+    if (ignore_index != LLONG_MIN && t == ignore_index) {
+        loss[idx] = (T_out)0;
+        return;
+    }
+    if (t < 0 || t >= C) {
+        loss[idx] = (T_out)0;
+        return;
+    }
+
+    const long long base = ((long long)n * C * inner_size) + inr;
+
+    // Numerically stable LSE: lse = log(sum exp(x - m)) + m
+    float m = -CUDART_INF_F;
+    for (int c = 0; c < C; ++c) {
+        m = fmaxf(m, to_float(logits[base + (long long)c * inner_size]));
+    }
+
+    float sum_exp = 0.f;
+    for (int c = 0; c < C; ++c) {
+        sum_exp += expf(to_float(logits[base + (long long)c * inner_size]) - m);
+    }
+
+    float lse = logf(sum_exp) + m;
+    float logit_t = to_float(logits[base + (long long)(int)t * inner_size]);
+    loss[idx] = (T_out)(lse - logit_t);
+}
+
+} // namespace cuda
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+    std::vector<size_t> logits_shape;
+    Opaque(std::shared_ptr<device::nvidia::Handle::Internal> p) : internal(p) {}
+    ~Opaque() = default;
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle_,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t /*loss_desc*/,
+                                  infiniopTensorDescriptor_t logits_desc,
+                                  infiniopTensorDescriptor_t /*target_desc*/) {
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = logits_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
+
+    const auto &orig = logits_desc->shape();
+    auto opaque = new Opaque(handle->internal());
+
+    if (orig.size() == 1) {
+        opaque->logits_shape = {1, orig[0]};
+    } else {
+        opaque->logits_shape = orig;
+    }
+
+    const auto &s = opaque->logits_shape;
+    long long N = (long long)s[0];
+    long long inner = 1;
+    for (size_t i = 2; i < s.size(); ++i) {
+        inner *= (long long)s[i];
+    }
+
+    size_t workspace_size = (size_t)(N * inner) * sizeof(float);
+    *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device,
+                               handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+#else
+    return INFINI_STATUS_NOT_IMPLEMENTED;
+#endif
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *loss, const void *logits,
+                                     const void *target, void *stream) const {
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+    const auto &s = _opaque->logits_shape;
+    int N = (int)s[0];
+    int C = (int)s[1];
+    long long inner = 1;
+    for (size_t i = 2; i < s.size(); ++i) {
+        inner *= (long long)s[i];
+    }
+    long long total = (long long)N * inner;
+
+    size_t need_ws = (size_t)total * sizeof(float);
+    if (workspace_size < need_ws) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    float *per_sample = reinterpret_cast<float *>(workspace);
+
+    const int64_t *tgt_i64 = reinterpret_cast<const int64_t *>(target);
+    const int64_t ignore_index = -100;
+
+    // 1) Write the per-sample loss into the workspace (float)
+    dim3 block(256);
+    dim3 grid((total + block.x - 1) / block.x);
+    cudaStream_t st = (cudaStream_t)stream;
+
+    if (_dtype == INFINI_DTYPE_F32) {
+        cuda::softmaxCrossEntropy_per_sample<<<grid, block, 0, st>>>(
+            per_sample, (const float *)logits, tgt_i64, N, C, inner, ignore_index);
+    } else if (_dtype == INFINI_DTYPE_F16) {
+        cuda::softmaxCrossEntropy_per_sample<<<grid, block, 0, st>>>(
+            per_sample, (const half *)logits, tgt_i64, N, C, inner, ignore_index);
+    } else if (_dtype == INFINI_DTYPE_BF16) {
+        cuda::softmaxCrossEntropy_per_sample<__nv_bfloat16, float>
+            <<<grid, block, 0, st>>>(per_sample, (const __nv_bfloat16 *)logits,
+                                     tgt_i64, N, C, inner, ignore_index);
+    }
+    {
+        auto err = cudaGetLastError();
+        if (err != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    }
+
+    // 2) Host-side mean (only samples with target != ignore_index are counted)
+    std::vector<float> h_loss((size_t)total);
+    std::vector<int64_t> h_tgt((size_t)total);
+    if (cudaMemcpyAsync(h_loss.data(), per_sample, need_ws,
+                        cudaMemcpyDeviceToHost, st)
+        != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    if (cudaMemcpyAsync(h_tgt.data(), tgt_i64, (size_t)total * sizeof(int64_t),
+                        cudaMemcpyDeviceToHost, st)
+        != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    if (cudaStreamSynchronize(st) != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    double acc = 0.0;
+    long long cnt = 0;
+    for (long long i = 0; i < total; ++i) {
+        if (h_tgt[i] != ignore_index) {
+            acc += (double)h_loss[i];
+            ++cnt;
+        }
+    }
+    double mean = (cnt > 0) ? (acc / (double)cnt) : 0.0;
+
+    // 3) Write the scalar mean back to the device loss pointer (one element, in the input dtype)
+    if (_dtype == INFINI_DTYPE_F32) {
+        float v = (float)mean;
+        if (cudaMemcpyAsync(loss, &v, sizeof(float), cudaMemcpyHostToDevice, st) != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    } else if (_dtype == INFINI_DTYPE_F16) {
+        half v = __float2half((float)mean);
+        if (cudaMemcpyAsync(loss, &v, sizeof(half), cudaMemcpyHostToDevice, st) != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    } else if (_dtype == INFINI_DTYPE_BF16) {
+        __nv_bfloat16 v = __float2bfloat16((float)mean);
+        if (cudaMemcpyAsync(loss, &v, sizeof(__nv_bfloat16), cudaMemcpyHostToDevice,
+                            st)
+            != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    }
+    if (cudaStreamSynchronize(st) != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+#else
+    return INFINI_STATUS_NOT_IMPLEMENTED;
+#endif
+}
+} // namespace op::cross_entropy_loss::nvidia
diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh
new file mode 100644
index 000000000..843fc943d
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __CROSS_ENTROPY_LOSS_CUDA_CUH__
+#define __CROSS_ENTROPY_LOSS_CUDA_CUH__
+
+#include "../cross_entropy_loss.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __CROSS_ENTROPY_LOSS_CUDA_CUH__
diff --git a/src/infiniop/ops/cross_entropy_loss/operator.cc b/src/infiniop/ops/cross_entropy_loss/operator.cc
new file mode 100644
index 000000000..8668dc574
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy_loss/operator.cc
@@ -0,0 +1,142 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/cross_entropy_loss.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/cross_entropy_loss_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/cross_entropy_loss_nvidia.cuh"
+#endif
+
+__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(
+    infiniopHandle_t handle,
infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + loss_desc, \ + logits_desc, \ + target_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCrossEntropyLoss( + infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, loss, logits, target, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..58a6d0f2d --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto 
&input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..867c7afa5 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,21 @@ +#ifndef __EXP_CPU_H__ +#define __EXP_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::exp(input); + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..12446f31a --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __EXP_CUDA_H__ +#define __EXP_CUDA_H__ + +#include +#include +#include + +namespace op::exp::cuda { +typedef struct ExpOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..3bdf2eb45 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto 
dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..cc369d660 --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/exp_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.cc b/src/infiniop/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 000000000..f7251bdd7 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,96 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::gather::cpu { + +infiniStatus_t calculate_gather( + const GatherInfo &info, + char *output, + const char *input, + const int64_t *index) { + // -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for (size_t d = 0; d < info.ndim; d++) { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + int gather_dim = static_cast(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + +#pragma omp parallel for + for (int i = 0; i < static_cast(total_size); i++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = static_cast(i); + for (int d = static_cast(info.ndim) - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + if (d != gather_dim) { + input_ptr += dim_index * element_size * info.input_strides[d]; + } + } + int64_t gather_number = *index_ptr; + input_ptr += gather_number * element_size * info.input_strides[gather_dim]; + // *output_ptr = *input_ptr; + memcpy( + output_ptr, + input_ptr, + element_size); + } + // --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + 
infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) const { + + return calculate_gather(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} // namespace op::gather::cpu diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.h b/src/infiniop/ops/gather/cpu/gather_cpu.h new file mode 100644 index 000000000..dac3583ac --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,8 @@ +#ifndef __GATHER_CPU_H__ +#define __GATHER_CPU_H__ + +#include "../gather.h" + +DESCRIPTOR(cpu) + +#endif // __GATHER_CPU_H__ diff --git a/src/infiniop/ops/gather/cuda/kernel.cuh b/src/infiniop/ops/gather/cuda/kernel.cuh new file mode 100644 index 000000000..dbb818e83 --- /dev/null +++ b/src/infiniop/ops/gather/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __GATHER_KERNEL_CUH__ +#define __GATHER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void gatherKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int gather_dim) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; d--) { + if (d == gather_dim) { + continue; + } + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_gather_size; c++) { + int64_t gather_number = *(index_ptr + c * index_strides[gather_dim]); + *(output_ptr + c * output_strides[gather_dim]) = *(input_ptr + gather_number * input_strides[gather_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __GATHER_KERNEL_CUH__ diff --git a/src/infiniop/ops/gather/gather.h b/src/infiniop/ops/gather/gather.h new file mode 100644 index 000000000..d4c88b0c4 --- /dev/null +++ b/src/infiniop/ops/gather/gather.h @@ -0,0 +1,47 @@ +#ifndef __GATHER_H__ +#define __GATHER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::gather::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GatherInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, 
\ + GatherInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *index, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/gather/info.h b/src/infiniop/ops/gather/info.h new file mode 100644 index 000000000..0098c7ea1 --- /dev/null +++ b/src/infiniop/ops/gather/info.h @@ -0,0 +1,58 @@ +#ifndef __GATHER_INFO_H__ +#define __GATHER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::gather { + +class GatherInfo { +private: + GatherInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + size_t input_dim_size; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createGatherInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE(output_desc->shape(), index_desc->shape()); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d++) { + if (d != dim) { + CHECK_OR_RETURN(input_desc->dim(d) == output_desc->dim(d), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + } + CHECK_OR_RETURN(ndim > dim, INFINI_STATUS_BAD_PARAM); + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(GatherInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->dim(dim), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::gather + +#endif // __GATHER_INFO_H__ diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cu b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu new file mode 100644 index 000000000..f0a2a7fb3 --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu @@ -0,0 +1,179 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "gather_nvidia.cuh" + +namespace op::gather::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, 
+ size_t index_gather_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int gather_dim) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata *output, + const Tdata *input, + const int64_t *index, + cudaStream_t stream, + void *workspace) { + size_t ndim = info.ndim; + ptrdiff_t *contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for (size_t d = 0; d < ndim; d++) { + if (d == gather_dim) { + continue; + } + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + ptrdiff_t *contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t *input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t *output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t *index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata><<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; + // ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream_) const { + if (workspace_size < _workspace_size) { + 
return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + +#define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) +#define CALCULATE_GATHER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + +#undef CALCULATE_GATHER_WITH_BLOCK_SIZE +#undef CALCULATE_GATHER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gather::nvidia diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh new file mode 100644 index 000000000..46d42fa0c --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GATHER_NVIDIA_API_H__ +#define __GATHER_NVIDIA_API_H__ +#include "../gather.h" + +DESCRIPTOR(nvidia) + +#endif // __GATHER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gather/operator.cc b/src/infiniop/ops/gather/operator.cc new file mode 100644 index 000000000..706009e9b --- /dev/null +++ b/src/infiniop/ops/gather/operator.cc @@ -0,0 +1,144 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gather.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gather_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/gather_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gather::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, index, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..e137be8a0 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..d5b369bce --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..9e279c2ef --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ 
b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..c51b18777 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, 
nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc new file mode 100644 index 000000000..68015ba6b --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc @@ -0,0 +1,93 @@ +#include "index_copy_inplace_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../../rearrange/cpu/rearrange_cpu.h" +#include "../info.h" + +namespace op::index_copy_inplace::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + // ---------------------- end: check data type and calculate workspace size ----------------------- + InfiniopTensorDescriptor *rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data()); + InfiniopTensorDescriptor *rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data()); + + void *in_rearrange_descriptor = nullptr; + void *out_rearrange_descriptor = nullptr; + + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc); + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) const { + size_t size_of_dtype = infiniSizeOf(_info.dtype); + auto index_ptr = reinterpret_cast(index); + + char *workspace_in = reinterpret_cast(workspace); + char *workspace_out = workspace_in + size_of_dtype * _info.total_input_size; + + reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream); + memset(workspace_out, 0, _info.total_output_size * size_of_dtype); + size_t copy_unit_size = _info.meta_strides[_info.dim] * size_of_dtype; +#pragma omp parallel for + for (int dst_index = 0; dst_index < static_cast(_info.output_shape[_info.dim]); dst_index++) { + size_t src_index = _info.index_shape[0] - 1; + while (true) { + if (*(index_ptr + src_index * _info.index_strides[0]) == static_cast(dst_index)) { + std::memcpy( + workspace_out + size_of_dtype * dst_index * _info.meta_strides[_info.dim], + workspace_in + size_of_dtype * src_index * _info.meta_strides[_info.dim], + copy_unit_size); + break; + } else if (src_index == 0) { + break; + } + src_index--; + } + } + 
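// The scatter above leaves, for every destination slice along the copy dimension,
+    // the last source slice whose index matches it (slices never selected by `index`
+    // stay zero-filled). The rearrange below copies the staged contiguous buffer back
+    // into the output tensor's original strides.
+    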
reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream); + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::cpu diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h new file mode 100644 index 000000000..49ba41f42 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_INPLACE_CPU_H__ +#define __INDEX_COPY_INPLACE_CPU_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(cpu) + +#endif // __INDEX_COPY_INPLACE_CPU_H__ diff --git a/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h new file mode 100644 index 000000000..80849ee2e --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h @@ -0,0 +1,53 @@ +#ifndef __INDEX_COPY_INPLACE_H__ +#define __INDEX_COPY_INPLACE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define INDEX_COPY_INPLACE_DESCRIPTOR(NAMESPACE) \ + namespace op::index_copy_inplace::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + IndexCopyInplaceInfo _info; \ + size_t _workspace_size; \ + void *_rearrange_desc_in; \ + void *_rearrange_desc_out; \ + Descriptor( \ + infiniDtype_t dtype, \ + IndexCopyInplaceInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id, \ + void *rearrange_desc_in, \ + void *rearrange_desc_out) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _rearrange_desc_in(rearrange_desc_in), \ + _rearrange_desc_out(rearrange_desc_out) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *index, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/info.h b/src/infiniop/ops/index_copy_inplace/info.h new file mode 100644 index 000000000..d7ad41d6f --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/info.h @@ -0,0 +1,74 @@ +#ifndef __INDEX_COPY_INPLACE_INFO_H__ +#define __INDEX_COPY_INPLACE_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::index_copy_inplace { + +class IndexCopyInplaceInfo { +private: + IndexCopyInplaceInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t total_input_size; + size_t total_output_size; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + std::vector meta_strides; + size_t dim; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createIndexCopyInplaceInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t 
input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + std::vector meta_strides(input_desc->ndim()); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + size_t total_input_size = 1; + size_t total_output_size = 1; + for (size_t d = 0; d < input_desc->ndim(); d++) { + total_input_size *= input_desc->dim(d); + total_output_size *= output_desc->dim(d); + if (d == dim) { + continue; + } else { + meta_strides[d] = last_dim * last_stride; + last_dim = input_desc->dim(d); + last_stride = meta_strides[d]; + } + } + meta_strides[dim] = last_dim * last_stride; + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(IndexCopyInplaceInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + total_input_size, + total_output_size, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + meta_strides, + dim + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::index_copy_inplace + +#endif // __INDEX_COPY_INPLACE_INFO_H__ diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu new file mode 100644 index 000000000..70772fe67 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu @@ -0,0 +1,127 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../rearrange/nvidia/rearrange_nvidia.cuh" +#include "../info.h" +#include "index_copy_inplace_nvidia.cuh" + +namespace op::index_copy_inplace::nvidia { + +infiniStatus_t calculate_index_copy_inplace( + char *output, + const char *input, + const int64_t *index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + cudaStream_t stream) { + int64_t *dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index++) { + CHECK_CUDA(cudaMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + cudaMemcpyDeviceToHost, + stream)); + cudaStreamSynchronize(stream); + CHECK_CUDA(cudaMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + cudaMemcpyDeviceToDevice, + stream)); + cudaStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + // 
---------------------- end: check data type and calculate workspace size ----------------------- + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + InfiniopTensorDescriptor *rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data()); + InfiniopTensorDescriptor *rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data()); + + void *in_rearrange_descriptor = nullptr; + void *out_rearrange_descriptor = nullptr; + + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc); + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream_) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char *workspace_in = reinterpret_cast(workspace); + char *workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + cudaMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + cudaDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream)); + cudaDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::nvidia diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh new file mode 100644 index 000000000..04c3c86f7 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#define __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(nvidia) + +#endif // __INDEX_COPY_INPLACE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/index_copy_inplace/operator.cc b/src/infiniop/ops/index_copy_inplace/operator.cc new file mode 100644 index 000000000..00e6bca3a --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/operator.cc @@ -0,0 +1,144 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/index_copy_inplace.h" + +#ifdef ENABLE_CPU_API +#include "cpu/index_copy_inplace_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include 
"nvidia/index_copy_inplace_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::index_copy_inplace::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopIndexCopyInplace( + infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, index, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc new file mode 100644 index 000000000..508dcecc6 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc @@ -0,0 +1,284 @@ +#include "interpolate_nearest_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include 
+ +namespace op::interpolate_nearest::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + InterpolateNearestInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const InterpolateNearestInfo &interpolate_info) + : handle(handle_ptr), info(interpolate_info) { + workspace_size = 0; + } + + template + size_t compute_input_index_1d(size_t idx) const { + size_t temp = idx; + + // 1D插值:3D张量 (N, C, W) + size_t w = temp % info.output_size[0]; + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; + } + + // 计算2D插值的输入索引 + template + size_t compute_input_index_2d(size_t idx) const { + size_t temp = idx; + + // 2D插值:4D张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width在索引1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; + } + + // 计算3D插值的输入索引 + template + size_t compute_input_index_3d(size_t idx) const { + size_t temp = idx; + + // 3D插值:5D张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width在索引2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height在索引1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = std::min(static_cast(std::floor(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; + } + + // 计算输出索引 + template + size_t compute_output_index(size_t idx) const { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= 
info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } + } + + // 计算总元素数 + size_t calculate_total_elements() const { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; + } + + // 主要的插值计算函数 + template + void interpolate_nearest_cpu(T *output, const T *input) const { + size_t total_elements = calculate_total_elements(); + +#pragma omp parallel for schedule(static) + for (ptrdiff_t idx = 0; idx < static_cast(total_elements); ++idx) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx); + break; + default: + continue; + } + + size_t output_idx = compute_output_index(idx); + output[output_idx] = input[input_idx]; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const InterpolateNearestInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16 && data_type != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_I8: { + int8_t *typed_output = static_cast(output); + const int8_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + 
} + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // 检查数据类型支持 + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_I8); + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, info, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::interpolate_nearest::cpu diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h new file mode 100644 index 000000000..78dd3ff97 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_CPU_H__ +#define __INTERPOLATE_NEAREST_CPU_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(cpu) + +#endif // __INTERPOLATE_NEAREST_CPU_H__ diff --git a/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh new file mode 100644 index 000000000..60c798792 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh @@ -0,0 +1,168 @@ +#ifndef INTERPOLATE_NEAREST_KERNEL_CUH +#define INTERPOLATE_NEAREST_KERNEL_CUH + +#include "../info.h" +#include + +template +__device__ inline size_t +compute_input_index_1d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 1D 插值:3D 张量 (N, C, W) + size_t w = temp % info.output_size[0]; // width 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; +} + +template +__device__ inline size_t +compute_input_index_2d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 2D 插值:4D 张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width 在索引 1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; +} + 
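// Editor's note (illustrative sketch, not part of the patch): the 1D/2D/3D index helpers
// above all apply the same per-axis nearest-neighbor rule,
//   src = min(floor(dst * in_size / out_size), in_size - 1),
// which a small host-side helper makes explicit:
#include <algorithm>
#include <cmath>
#include <cstddef>

size_t nearest_src(size_t dst, size_t in_size, size_t out_size) {
    float inv_scale = static_cast<float>(in_size) / static_cast<float>(out_size);
    return std::min(static_cast<size_t>(std::floor(dst * inv_scale)), in_size - 1);
}
// Example: upsampling W from 4 to 8 (inv_scale = 0.5) maps dst 0..7 to src 0,0,1,1,2,2,3,3;
// downsampling W from 8 to 4 (inv_scale = 2.0) maps dst 0..3 to src 0,2,4,6.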
+template +__device__ inline size_t +compute_input_index_3d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 3D 插值:5D 张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width 在索引 2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height 在索引 1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = min(static_cast(floorf(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; +} + +template +__device__ inline size_t +compute_output_index(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D 张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D 张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D 张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } +} + +__host__ __device__ inline size_t +calculate_total_elements(const InterpolateNearestInfo &info) { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; +} + +template +__global__ void interpolate_nearest_kernel(T *output, const T *input, + InterpolateNearestInfo info) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_elements = calculate_total_elements(info); + + if (idx < total_elements) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx, info); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx, info); + break; + case INTERPOLATE_3D: + input_idx = 
compute_input_index_3d(idx, info); + break; + default: + return; + } + + size_t output_idx = compute_output_index(idx, info); + output[output_idx] = input[input_idx]; + } +} + +#endif // INTERPOLATE_NEAREST_KERNEL_CUH diff --git a/src/infiniop/ops/interpolate_nearest/info.h b/src/infiniop/ops/interpolate_nearest/info.h new file mode 100644 index 000000000..162d6eb02 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/info.h @@ -0,0 +1,118 @@ +#ifndef __INTERPOLATE_NEAREST_INFO_H__ +#define __INTERPOLATE_NEAREST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +enum InterpolateDim { + INTERPOLATE_1D = 1, // 3D 张量 (N, C, W) + INTERPOLATE_2D = 2, // 4D 张量 (N, C, H, W) + INTERPOLATE_3D = 3 // 5D 张量 (N, C, D, H, W) +}; + +struct InterpolateNearestInfo { + size_t batch_size; + size_t channels; + + // 输入和输出的空间维度大小 + size_t input_size[3]; // [depth/height/width] 根据维度使用不同数量 + size_t output_size[3]; // [depth/height/width] 根据维度使用不同数量 + + InterpolateDim dim; // 插值维度:1D, 2D, 3D + infiniDtype_t dtype; + + // 张量步长(最多支持 5D 张量) + size_t input_stride[5]; + size_t output_stride[5]; + + static infiniStatus_t create( + InterpolateNearestInfo *info, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + // 检查数据类型 + if (input_desc->dtype() != output_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto input_shape = input_desc->shape(); + auto output_shape = output_desc->shape(); + auto input_stride = input_desc->strides(); + auto output_stride = output_desc->strides(); + + // 根据张量维度确定插值类型 + if (input_desc->ndim() == 3 && output_desc->ndim() == 3) { + // 1D 插值:3D 张量 (N, C, W) + info->dim = INTERPOLATE_1D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // width + info->output_size[0] = output_shape[2]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 3; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 4 && output_desc->ndim() == 4) { + // 2D 插值:4D 张量 (N, C, H, W) + info->dim = INTERPOLATE_2D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // height + info->input_size[1] = input_shape[3]; // width + info->output_size[0] = output_shape[2]; // height + info->output_size[1] = output_shape[3]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 4; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 5 && output_desc->ndim() == 5) { + // 3D 插值:5D 张量 (N, C, D, H, W) + info->dim = INTERPOLATE_3D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // depth + info->input_size[1] = input_shape[3]; // height + info->input_size[2] = input_shape[4]; // width + info->output_size[0] = output_shape[2]; // depth + info->output_size[1] = output_shape[3]; // height + info->output_size[2] = output_shape[4]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 5; ++i) 
{ + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info->dtype = input_desc->dtype(); + return INFINI_STATUS_SUCCESS; + } +}; + +#endif // __INTERPOLATE_NEAREST_INFO_H__ diff --git a/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h new file mode 100644 index 000000000..73499c2ff --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h @@ -0,0 +1,51 @@ +#ifndef __INTERPOLATE_NEAREST_H__ +#define __INTERPOLATE_NEAREST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::interpolate_nearest::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + InterpolateNearestInfo _info; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + InterpolateNearestInfo info, \ + size_t workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _dtype(dtype), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __INTERPOLATE_NEAREST_H__ diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu new file mode 100644 index 000000000..a7b63c6f4 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "interpolate_nearest_nvidia.cuh" +#include +#include +#include + +namespace op::interpolate_nearest::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // Check supported data types + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + size_t total_elements = calculate_total_elements(_info); + + int block_size = 256; + int grid_size = (total_elements + 
block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = reinterpret_cast(output); + const float *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_F16: { + half *typed_output = reinterpret_cast(output); + const half *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<<>>( + typed_output, typed_input, _info); + } break; + + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__nv_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<__nv_bfloat16> + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::nvidia diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh new file mode 100644 index 000000000..aab5f7882 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __INTERPOLATE_NEAREST_NVIDIA_CUH__ +#define __INTERPOLATE_NEAREST_NVIDIA_CUH__ + +#include "../../../devices/nvidia/nvidia_handle.h" +#include "../interpolate_nearest.h" + +DESCRIPTOR(nvidia) + +#endif // __INTERPOLATE_NEAREST_NVIDIA_CUH__ diff --git a/src/infiniop/ops/interpolate_nearest/operator.cc b/src/infiniop/ops/interpolate_nearest/operator.cc new file mode 100644 index 000000000..df367cfde --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/interpolate_nearest.h" + +#ifdef ENABLE_CPU_API +#include "cpu/interpolate_nearest_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/interpolate_nearest_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor( + infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::interpolate_nearest::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif 
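// Editor's note (illustrative fragment, not part of the patch): expected call sequence for
// the interpolate_nearest C API defined in this operator.cc. `handle`, `out_desc`,
// `in_desc`, `d_out`, `d_in`, `stream` and the CHECK(...) macro are placeholders assumed
// to be provided by the caller; this implementation reports a zero-sized workspace.
infiniopInterpolateNearestDescriptor_t desc = nullptr;
CHECK(infiniopCreateInterpolateNearestDescriptor(handle, &desc, out_desc, in_desc));
size_t workspace_size = 0;
CHECK(infiniopGetInterpolateNearestWorkspaceSize(desc, &workspace_size));
void *workspace = nullptr; // may stay null while workspace_size == 0
CHECK(infiniopInterpolateNearest(desc, workspace, workspace_size, d_out, d_in, stream));
CHECK(infiniopDestroyInterpolateNearestDescriptor(desc));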
+#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopInterpolateNearest( + infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc new file mode 100644 index 000000000..16c859985 --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc @@ -0,0 +1,322 @@ +#include "maxpool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::maxpool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + // CPU实现通常不需要额外的工作空间 + workspace_size = 0; + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 1D最大池化 + template + void maxpool_1d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 并行处理每个批次和通道 +#pragma omp parallel for schedule(static) + for (int b = 0; b < 
static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + size_t input_offset = static_cast(b) * channels * input_width + static_cast(c) * input_width; + size_t output_offset = static_cast(b) * channels * output_width + static_cast(c) * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + + output[output_offset + ow] = max_val; + } + } + } + } + + // 2D最大池化 + template + void maxpool_2d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 并行处理每个批次和通道 +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + size_t input_offset = static_cast(b) * channels * input_height * input_width + static_cast(c) * input_height * input_width; + size_t output_offset = static_cast(b) * channels * output_height * output_width + static_cast(c) * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + + output[output_offset + oh * output_width + ow] = max_val; + } + } + } + } + } + + // 3D最大池化 + template + void maxpool_3d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 并行处理每个批次和通道 +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int 
c = 0; c < static_cast(channels); ++c) { + size_t input_offset = static_cast(b) * channels * input_depth * input_height * input_width + static_cast(c) * input_depth * input_height * input_width; + size_t output_offset = static_cast(b) * channels * output_depth * output_height * output_width + static_cast(c) * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kd * input_height * input_width + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + } + + output[output_offset + od * output_height * output_width + oh * output_width + ow] = max_val; + } + } + } + } + } + } + + // 主要的最大池化计算函数 + template + void maxpool_cpu(T *output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_1d(output, input); + break; + case 2: + maxpool_2d(output, input); + break; + case 3: + maxpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + 
bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::maxpool::cpu diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h new file mode 100644 index 000000000..f3ecd349d --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CPU_H__ +#define __MAX_POOL_CPU_H__ + +#include "../maxpool.h" + +DESCRIPTOR(cpu) + +#endif // __MAX_POOL_CPU_H__ diff --git a/src/infiniop/ops/maxpool/info.h b/src/infiniop/ops/maxpool/info.h new file mode 100644 index 000000000..ff56fe28c --- /dev/null +++ b/src/infiniop/ops/maxpool/info.h @@ -0,0 +1,113 @@ +#ifndef __MAX_POOL_INFO_H__ +#define __MAX_POOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::maxpool { + +inline utils::Result calculateMaxPoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + // 理论最大输出数 + size_t max_output = 0; + if (ceil_mode) { + max_output = (input_size + 2 * padding - kernel_size + stride - 1) / stride + 1; + } else { + max_output = (input_size + 2 * padding - kernel_size) / stride + 1; + } + + size_t valid_output = 0; + for (size_t i = 0; i < max_output; ++i) { + int64_t start = static_cast(i) * stride - padding; + int64_t end = start + kernel_size; + // 判断区间 [start, end) 和 [0, input_size) 是否有交集 + int64_t real_start = std::max(start, int64_t(0)); + int64_t real_end = std::min(end, int64_t(input_size)); + if (real_end > real_start) { + ++valid_output; + } + } + return utils::Result(valid_output); +} + +class MaxPoolInfo { + MaxPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + 
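// Editor's note (worked example, illustrative only): calculateMaxPoolOutputSize above
// evaluates
//   floor mode: (in + 2*pad - k) / s + 1
//   ceil  mode: (in + 2*pad - k + s - 1) / s + 1
// For in = 5, k = 2, s = 2, pad = 0 this gives 2 (floor) or 3 (ceil). The trailing loop
// then drops windows with no overlap with [0, in); here the ceil-mode windows start at
// 0, 2 and 4 and all overlap the input, so valid_output remains 3.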
info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Get spatial dimensions + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + auto output_size = calculateMaxPoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size); + size_t expected_size = output_size.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::maxpool + +#endif // __MAX_POOL_INFO_H__ diff --git a/src/infiniop/ops/maxpool/maxpool.h b/src/infiniop/ops/maxpool/maxpool.h new file mode 100644 index 000000000..5ee7703c5 --- /dev/null +++ b/src/infiniop/ops/maxpool/maxpool.h @@ -0,0 +1,53 @@ +#ifndef __MAX_POOL_H__ +#define __MAX_POOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAX_POOL_H__ diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu new file mode 100644 index 000000000..8b94a29c1 --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu @@ -0,0 +1,240 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "maxpool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef 
ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically doesn't need workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque 
opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::nvidia diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh new file mode 100644 index 000000000..539ad5a1a --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CUDA_CUH__ +#define __MAX_POOL_CUDA_CUH__ + +#include "../maxpool.h" + +DESCRIPTOR(nvidia) + +#endif // __MAX_POOL_CUDA_CUH__ diff --git a/src/infiniop/ops/maxpool/operator.cc b/src/infiniop/ops/maxpool/operator.cc new file mode 100644 index 000000000..fa47b5b72 --- /dev/null +++ b/src/infiniop/ops/maxpool/operator.cc @@ -0,0 +1,147 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/maxpool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/maxpool_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return
op::maxpool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMaxPool( + infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc new file mode 100644 index 000000000..03b808bf5 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -0,0 +1,100 @@ +#include "scatter_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::scatter::cpu { + +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + char *output, + const char *input, + const int64_t *index) { + // -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for (size_t d = 0; d < info.ndim; d++) { + if (d == info.dim) { + continue; + } + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + 
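// The loop above packs every non-scatter dimension into one flat batch index:
+    // batch_size below is the product of the index shape over those dimensions
+    // (e.g. an index shape of (2, 3, 4) with dim == 1 gives batch_size = 2 * 4 = 8),
+    // and each iteration of the parallel loop handles one such slice, copying
+    // index_shape[dim] elements along the scatter dimension.
+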
size_t batch_size = last_dim * last_stride; + int scatter_dim = static_cast(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + +#pragma omp parallel for + for (int n = 0; n < static_cast(batch_size); n++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = static_cast(n); + for (int d = static_cast(info.ndim) - 1; d >= 0; d--) { + if (d == scatter_dim) { + continue; + } + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + input_ptr += dim_index * element_size * info.input_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + } + for (size_t c = 0; c < info.index_shape[scatter_dim]; c++) { + int64_t scatter_number = *(index_ptr + c * info.index_strides[scatter_dim]); + memcpy( + output_ptr + scatter_number * element_size * info.output_strides[scatter_dim], + input_ptr + c * element_size * info.input_strides[scatter_dim], + element_size); + } + } + + // --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) const { + + return calculate_scatter(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} // namespace op::scatter::cpu diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h new file mode 100644 index 000000000..ad52c7b91 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_CPU_H__ +#define __SCATTER_CPU_H__ + +#include "../scatter.h" + +DESCRIPTOR(cpu) + +#endif // __SCATTER_CPU_H__ diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh new file mode 100644 index 000000000..733d2e14d --- /dev/null +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __SCATTER_KERNEL_CUH__ +#define __SCATTER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void scatterKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int scatter_dim) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; 
d--) { + if (d == scatter_dim) { + continue; + } + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_scatter_size; c += BLOCK_SIZE) { + int64_t scatter_number = *(index_ptr + c * index_strides[scatter_dim]); + *(output_ptr + scatter_number * output_strides[scatter_dim]) = *(input_ptr + c * input_strides[scatter_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __SCATTER_KERNEL_CUH__ diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h new file mode 100644 index 000000000..4ef8b9e76 --- /dev/null +++ b/src/infiniop/ops/scatter/info.h @@ -0,0 +1,64 @@ +#ifndef __SCATTER_INFO_H__ +#define __SCATTER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::scatter { + +class ScatterInfo { +private: + ScatterInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createScatterInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == output_desc->ndim() && output_desc->ndim() == index_desc->ndim(), + INFINI_STATUS_BAD_TENSOR_SHAPE); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d++) { + if (d != dim) { + CHECK_OR_RETURN( + index_desc->dim(d) <= input_desc->dim(d) && index_desc->dim(d) <= output_desc->dim(d), + INFINI_STATUS_BAD_TENSOR_SHAPE;); + } + } + CHECK_OR_RETURN(index_desc->dim(dim) <= input_desc->dim(dim), INFINI_STATUS_BAD_TENSOR_SHAPE); + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(ScatterInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::scatter + +#endif // __SCATTER_INFO_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h new file mode 100644 index 000000000..d5ce0ef16 --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_METAX_H__ +#define __SCATTER_METAX_H__ + +#include "../scatter.h" + +DESCRIPTOR(metax) + +#endif // __SCATTER_METAX_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca new file mode 100644 index 000000000..1c742f60d --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -0,0 +1,190 @@ 
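+// Metax (MACA) backend for the scatter operator. It mirrors the NVIDIA CUDA
+// implementation in this patch, using the hc* runtime (hcStream_t, hcMemcpyAsync)
+// in place of the cuda* equivalents.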
+#include "../../../devices/metax/metax_common.h" +#include "scatter_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate 
workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_SCATTER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_SCATTER_WITH_METAX_BLOCK + #undef CALCULATE_SCATTER +} +} // namespace op::scatter::metax diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu new file mode 100644 index 000000000..136ad1f65 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -0,0 +1,180 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "scatter_nvidia.cuh" + +namespace op::scatter::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int scatter_dim) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel 
----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata *output, + const Tdata *input, + const int64_t *index, + cudaStream_t stream, + void *workspace) { + size_t ndim = info.ndim; + ptrdiff_t *contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for (size_t d = 0; d < ndim; d++) { + if (d == scatter_dim) { + continue; + } + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t *contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t *input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t *output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t *index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; + // ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream_) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) +#define CALCULATE_SCATTER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case 
INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + +#undef CALCULATE_SCATTER_WITH_BLOCK_SIZE +#undef CALCULATE_SCATTER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::scatter::nvidia diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh new file mode 100644 index 000000000..a199edb6e --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __SCATTER_NVIDIA_API_H__ +#define __SCATTER_NVIDIA_API_H__ +#include "../scatter.h" + +DESCRIPTOR(nvidia) + +#endif // __SCATTER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc new file mode 100644 index 000000000..95857d731 --- /dev/null +++ b/src/infiniop/ops/scatter/operator.cc @@ -0,0 +1,160 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scatter.h" + +#ifdef ENABLE_CPU_API +#include "cpu/scatter_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/scatter_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/scatter_metax.h" +#endif + +__C infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopScatter( + infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h new file mode 100644 index 000000000..e1e332471 --- /dev/null +++ b/src/infiniop/ops/scatter/scatter.h @@ -0,0 +1,47 @@ +#ifndef __SCATTER_H__ +#define __SCATTER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + ScatterInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *index, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/test/infiniop/averagepool.py 
b/test/infiniop/averagepool.py new file mode 100644 index 000000000..55d5c37cf --- /dev/null +++ b/test/infiniop/averagepool.py @@ -0,0 +1,239 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import Tuple +import math +from torch.nn import functional as F + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool(input_tensor, kernel_size, stride, padding, ceil_mode, output_tensor): + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + result = F.avg_pool1d( + input_tensor.to(torch.float32), kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + result = F.avg_pool2d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + result = F.avg_pool3d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + # 将计算结果转换回原始数据类型 + output_tensor.copy_(result.to(output_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + return ( + math.ceil((input_size + 2 * p - k) / s 
+ 1) + if ceil_mode + else math.floor((input_size + 2 * p - k) / s + 1) + ) + + batch, channel, *spatial = input_shape + output_spatial = [ + calc_output_size(spatial[i], kernel_size[i], stride[i], padding[i], ceil_mode) + for i in range(len(spatial)) + ] + return (batch, channel) + tuple(output_spatial) + + +def tuple_to_void_p(py_tuple: Tuple): + arr = (ctypes.c_uint64 * len(py_tuple))(*py_tuple) + return ctypes.cast(arr, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + output_tensor = TestTensor(output_shape, None, dt=tensor_dtype, device=device) + + print( + f"Testing AvgPool on {InfiniDeviceNames[device]} with input_shape: {input_shape}, kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, output_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_averagepool(): + check_error( + LIBINFINIOP.infiniopAvgPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Mismatch for shape {input_shape}, kernel {kernel_size}" + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lib_averagepool, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/batch_norm.py b/test/infiniop/batch_norm.py new file mode 100644 index 000000000..a7b46858f --- /dev/null +++ b/test/infiniop/batch_norm.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + 
debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, momentum, eps + ((13, 4, 5,), 0.1, 1e-5), + ((2, 3, 4), 0.1, 1e-4), + ((15, 16, 17,), 0.2, 1e-5), + ((50, 60, 70), 0.1, 1e-4), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# No implement for INPLACE + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_batch_norm( + output: torch.Tensor, + running_mean: torch.Tensor, + running_var: torch.Tensor, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + init_running_mean: torch.Tensor, + init_running_var: torch.Tensor, + momentum: float, + eps: float +): + bn = torch.nn.BatchNorm1d( + num_features=input.shape[1], + eps=eps, + momentum=momentum, + dtype=input.dtype, + ) + bn.weight.data = weight + bn.bias.data = bias + bn.running_mean.data = init_running_mean + bn.running_var.data = init_running_var + output.copy_(bn(input).detach()) + running_mean.copy_(bn.running_mean.data) + running_var.copy_(bn.running_var.data) + + +def test( + handle, + device, + shape, momentum, eps, + inplace, + dtype, + sync=None, +): + running_mean = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + running_var = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + input = TestTensor( + shape, + None, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + shape, + None, + dtype, + device + ) + + weight = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + bias = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + + print( + f"Testing BatchNorm on {InfiniDeviceNames[device]} with shape:{shape}, inplace:{inplace}, momentum:{momentum}, eps:{eps}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + + torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), + running_mean.torch_tensor(), running_var.torch_tensor(), + momentum, eps + ) + + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBatchNormDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + running_mean.descriptor, + running_var.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor, + momentum, + eps + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, running_mean, running_var, input, weight, bias]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBatchNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_batch_norm(): + check_error( + 
LIBINFINIOP.infiniopBatchNorm( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + running_mean.data(), + running_var.data(), + input.data(), + weight.data(), + bias.data(), + None, + ) + ) + + lib_batch_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + debug(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + debug(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), momentum, eps + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_batch_norm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyBatchNormDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my BatchNorm passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss.py b/test/infiniop/cross_entropy_loss.py new file mode 100644 index 000000000..acc5cadc4 --- /dev/null +++ b/test/infiniop/cross_entropy_loss.py @@ -0,0 +1,213 @@ +import torch +import ctypes +from ctypes import c_uint64 +import numpy as np + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + infiniopOperatorDescriptor_t, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + TestWorkspace, + InfiniDeviceEnum, +) +from torch.nn import functional as F + +_TEST_CASES = [ + # Single sample classification + ((10,), 10), + ((200,), 200), + # 2D: (N, C) - batch classification + ((4, 10), 10), + ((8, 5), 5), + ((16, 100), 100), + ((32, 1000), 1000), + ((64, 21), 21), + ((128, 50), 50), + # 3D: (N, C, d1) - sequence classification + ((4, 10, 5), 10), + # 4D: (N, C, d1, d2) - image segmentation + ((2, 8, 8, 8), 8), + # 5D: (N, C, d1, d2, d3) - 3D segmentation + ((3, 10, 10, 20, 30), 10), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_pytorch(logits, target): + return F.cross_entropy(logits.double(), target.long(), reduction="mean") + + +def test( + handle, + device, + input_shape, + num_classes, + tensor_dtype=InfiniDtype.F32, + sync=None, +): + # 根据输入形状确定logits和target的形状 + if len(input_shape) == 1: + # Shape (C,) - single sample classification + logits_shape = (num_classes,) + target_shape = (1,) # 修改:使用 (1,) 
而不是标量 + else: + # Shape (N, C, [d1], [d2], ...) + logits_shape = input_shape + target_shape = (input_shape[0],) + input_shape[2:] + + print( + f"Testing CrossEntropyLoss on {InfiniDeviceNames[device]} with logits_shape: {logits_shape}, target_shape: {target_shape}, dtype:{InfiniDtypeNames[tensor_dtype]}" + ) + + # 创建logits张量 + logits = TestTensor(logits_shape, None, dt=tensor_dtype, device=device) + + # 创建target张量 + target_torch = torch.randint( + 0, + num_classes, + target_shape, + dtype=torch.long, + device=logits.torch_tensor().device, + ) + target = TestTensor.from_torch(target_torch, dt=InfiniDtype.I64, device=device) + + # 创建loss张量 + loss = TestTensor((1,), None, dt=tensor_dtype, device=device) + + # 计算PyTorch参考损失 + if len(input_shape) == 1: + # 对于一维logits,target需要是标量 + target_scalar = target.torch_tensor()[0] + pytorch_loss = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + pytorch_loss = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + + # 将参考结果存储到loss张量 + loss.torch_tensor()[0] = pytorch_loss.to(loss.torch_tensor().dtype) + + if sync: + sync() + + # 创建算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossDescriptor( + handle, + ctypes.byref(descriptor), + loss.descriptor, + logits.descriptor, + target.descriptor, + ) + ) + + # 销毁tensor的描述符以防止内核直接使用 + for tensor in [logits, target, loss]: + tensor.destroy_desc() + + # 获取工作空间大小并创建工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + # PyTorch参考实现函数 + def torch_cross_entropy(): + if len(input_shape) == 1: + target_scalar = target.torch_tensor()[0] + result = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + result = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + loss.torch_tensor()[0] = result.to(loss.torch_tensor().dtype) + + # InfiniOP实现函数 + def lib_cross_entropy(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLoss( + descriptor, + workspace.data(), + workspace_size.value, + loss.data(), + logits.data(), + target.data(), + None, + ) + ) + + # 执行InfiniOP算子 + lib_cross_entropy() + + if sync: + sync() + + # 验证结果 + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + actual_loss = loss.actual_tensor()[0] + expected_loss = loss.torch_tensor()[0] + + if DEBUG: + print(f"Expected loss: {expected_loss.item()}") + print(f"Actual loss: {actual_loss.item()}") + if target_shape: + print( + f"Target shape: {target_shape}, first few targets: {target.torch_tensor().flatten()[:5]}" + ) + else: + print(f"Target (scalar): {target.torch_tensor()[0].item()}") + debug(actual_loss, expected_loss, atol=atol, rtol=rtol) + + if not torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol): + print("--- ERROR ANALYSIS ---") + print(f"Expected: {expected_loss.item()}, Actual: {actual_loss.item()}") + print(f"Difference: {abs(actual_loss - expected_loss).item()}") + print(f"Tolerance: atol={atol}, rtol={rtol}") + + assert torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol) + + # Profile功能 + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCrossEntropyLossDescriptor(descriptor)) 
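+
+# A minimal reference sketch (not used by the test above) of the quantity being
+# checked for the plain (N, C) case: the mean negative log-softmax probability of
+# the target class, which is what F.cross_entropy with reduction="mean" computes.
+# The helper name is ours for illustration, not part of the library under test.
+def _manual_cross_entropy_2d(logits, target):
+    log_p = torch.log_softmax(logits.double(), dim=1)
+    return -log_p.gather(1, target.long().unsqueeze(1)).mean()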
+ + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAll CrossEntropyLoss tests passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + 
lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gather.py b/test/infiniop/gather.py new file mode 100644 index 000000000..b5c8ea93d --- /dev/null +++ b/test/infiniop/gather.py @@ -0,0 +1,160 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + # input_shape, output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None) +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_gather(output, input, dim, index): + torch.gather(input, dim, index, out=output) + +def test( + handle, + device, + input_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing Gather on {InfiniDeviceNames[device]} with input shape:{input_shape}, dim:{dim}, output_shape:{output_shape}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device + ) + torch_index = torch.randint(low=0, high=input_shape[dim], size=output_shape, dtype=torch.int64) + if index_strides: + torch_index = torch_index.as_strided(output_shape, index_strides) + index = TestTensor( + output_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + ) + + torch_gather(output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in 
[input, output, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGatherWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_gather(): + check_error( + LIBINFINIOP.infiniopGather( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_gather() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print("x:", input.torch_tensor()) + # print("CALCULATED:\n", output.actual_tensor(), ) + # print("GT\n", output.torch_tensor()) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_gather( + output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gather(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGatherDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my Gather passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = 
TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/index_copy_inplace.py b/test/infiniop/index_copy_inplace.py new file mode 100644 index 000000000..97dbd8266 --- /dev/null +++ b/test/infiniop/index_copy_inplace.py @@ -0,0 +1,180 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_TEST_CASES = [ + # input_shape, output_shape, dim, output_strides, input_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], Inplace.OUT_OF_PLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.INPLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.OUT_OF_PLACE), + ([133, 23, 53], [133, 23, 53], 1, None, None, Inplace.OUT_OF_PLACE), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, Inplace.OUT_OF_PLACE), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + 
InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_index_copy_inplace(output, input, index, dim): + output.index_copy_(dim, index, input.clone()) + + +def test( + handle, + device, + input_shape, output_shape, dim, output_strides, input_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing index_copy_inplace on {InfiniDeviceNames[device]} with shape:{input_shape}," + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + assert output_shape == input_shape + output = input + else: + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + index_list = list(range(output_shape[dim])) + + random.shuffle(index_list) + torch_index = torch.tensor(index_list[:input_shape[dim]], dtype=torch.int64) + index = TestTensor( + [input_shape[dim]], + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_index_copy_inplace(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetIndexCopyInplaceWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_index_copy_inplace(): + check_error( + LIBINFINIOP.infiniopIndexCopyInplace( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_index_copy_inplace() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_index_copy_inplace( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_index_copy_inplace(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my index_copy_inplace passed!\033[0m") diff --git a/test/infiniop/interpolate_nearest.py b/test/infiniop/interpolate_nearest.py new file mode 100644 index 000000000..0440fdfec --- /dev/null +++ 
b/test/infiniop/interpolate_nearest.py @@ -0,0 +1,265 @@ +import torch +import ctypes +from ctypes import c_uint64 + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, output_shape, output_stride) +_TEST_CASES = [ + # 2D test cases - simplified to one line each + ((1, 1, 2, 2), None, (1, 1, 4, 4), None), # Simple contiguous case + ((1, 3, 4, 4), (48, 16, 4, 1), (1, 3, 8, 8), (192, 64, 8, 1)), # 2D upscaling 2x + ((1, 3, 8, 8), (192, 64, 8, 1), (1, 3, 4, 4), (48, 16, 4, 1)), # 2D downscaling 2x + ((2, 4, 2, 2), (16, 4, 2, 1), (2, 4, 6, 6), (144, 36, 6, 1)), # Batch upscaling + ( + (1, 1, 3, 5), + (15, 15, 5, 1), + (1, 1, 9, 10), + (90, 90, 10, 1), + ), # Different aspect ratio + ( + (4, 64, 16, 16), + (16384, 256, 16, 1), + (4, 64, 32, 32), + (65536, 1024, 32, 1), + ), # Large batch + ((1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 7, 7), (49, 49, 7, 1)), # Small to large + ( + (1, 2, 3, 4), + (24, 1, 8, 2), + (1, 2, 6, 8), + (96, 1, 16, 2), + ), # Non-contiguous layout + ((2, 3, 2, 2), (32, 8, 4, 1), (2, 3, 4, 4), (128, 32, 8, 1)), # Padded strides + # 1D test cases + ((1, 3, 8), (24, 8, 1), (1, 3, 16), (48, 16, 1)), # 1D upscaling 2x + ((2, 5, 10), (50, 10, 1), (2, 5, 5), (25, 5, 1)), # 1D downscaling 2x + ((4, 2, 32), (64, 32, 1), (4, 2, 64), (128, 64, 1)), # 1D larger upscaling + # 3D test cases + ( + (1, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (1, 2, 4, 4, 4), + (128, 64, 16, 4, 1), + ), # 3D upscaling 2x + ( + (1, 1, 2, 3, 4), + (24, 24, 12, 4, 1), + (1, 1, 4, 6, 8), + (192, 192, 48, 8, 1), + ), # 3D uniform upscaling + ( + (3, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (3, 2, 3, 3, 3), + (54, 27, 9, 3, 1), + ), # 3D non-uniform scaling +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.I8] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def interpolate_nearest(input_tensor, output_shape, output_tensor): + """ + Perform nearest neighbor interpolation using PyTorch as reference + """ + # Extract spatial dimensions (H, W) + target_size = output_shape[2:] # Skip batch and channel dimensions + + # Use PyTorch's interpolate function with nearest mode + if input_tensor.dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float32,进行插值,再转换回原类型 + original_dtype = input_tensor.dtype + + # 转换为 float32 进行插值 + float_input = input_tensor.float() + result = F.interpolate(float_input, size=target_size, mode="nearest") + + # 转换回原始类型 + result = result.to(original_dtype) + else: + result = F.interpolate(input_tensor, size=target_size, mode="nearest") + + output_tensor.copy_(result) + + +def test( + handle, + device, + input_shape, + input_stride, + output_shape, + output_stride, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input and output tensors + # For I8 type, use 
appropriate randint range (-128 to 127) and don't use scale + if tensor_dtype == InfiniDtype.I8: + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, + randint_low=-128, randint_high=128 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device, + randint_low=-128, randint_high=128 + ) + else: + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing InterpolateNearest on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"input_stride: {input_stride}, output_stride: {output_stride}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ) + + if sync is not None: + sync() + + # Create descriptor for our interpolate_nearest operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateInterpolateNearestDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetInterpolateNearestWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_interpolate_nearest(): + check_error( + LIBINFINIOP.infiniopInterpolateNearest( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_interpolate_nearest() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for shape {input_shape} -> {output_shape}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_interpolate_nearest(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyInterpolateNearestDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 20a9188d6..a0f7cbccb 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float +from ctypes import 
c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -1897,3 +1897,322 @@ def paged_attention_prefill_(lib): lib.infiniopDestroyPagedAttentionPrefillDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def averagepool_(lib): + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_bool, + ] + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def batch_norm_(lib): + lib.infiniopCreateBatchNormDescriptor.restype = c_int32 + lib.infiniopCreateBatchNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopGetBatchNormWorkspaceSize.restype = c_int32 + lib.infiniopGetBatchNormWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopBatchNorm.restype = c_int32 + lib.infiniopBatchNorm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyBatchNormDescriptor.restype = c_int32 + lib.infiniopDestroyBatchNormDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cross_entropy_loss_(lib): + lib.infiniopCreateCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCrossEntropyLossWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCrossEntropyLoss.restype = c_int32 + lib.infiniopCrossEntropyLoss.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExpDescriptor.restype = c_int32 + 
lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gather_(lib): + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetGatherWorkspaceSize.restype = c_int32 + lib.infiniopGetGatherWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def index_copy_inplace_(lib): + lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopCreateIndexCopyInplaceDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetIndexCopyInplaceWorkspaceSize.restype = c_int32 + lib.infiniopGetIndexCopyInplaceWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIndexCopyInplace.restype = c_int32 + lib.infiniopIndexCopyInplace.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopDestroyIndexCopyInplaceDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def interpolate_nearest_(lib): + lib.infiniopCreateInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopCreateInterpolateNearestDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetInterpolateNearestWorkspaceSize.restype = c_int32 + lib.infiniopGetInterpolateNearestWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopInterpolateNearest.restype = c_int32 + lib.infiniopInterpolateNearest.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopDestroyInterpolateNearestDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def maxpool_(lib): + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + 
lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_bool, + ] + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def scatter_(lib): + lib.infiniopCreateScatterDescriptor.restype = c_int32 + lib.infiniopCreateScatterDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetScatterWorkspaceSize.restype = c_int32 + lib.infiniopGetScatterWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopScatter.restype = c_int32 + lib.infiniopScatter.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyScatterDescriptor.restype = c_int32 + lib.infiniopDestroyScatterDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/maxpool.py b/test/infiniop/maxpool.py new file mode 100644 index 000000000..81ddce060 --- /dev/null +++ b/test/infiniop/maxpool.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D max pooling cases + ((1, 3, 8), None, (2,), (2,), (0,), False), + ((2, 4, 16), None, (3,), (2,), (1,), False), + ((3, 2, 77), None, (6,), (4,), (3,), True), + # 2D max pooling cases + ((1, 1, 4, 4), None, (2, 2), (2, 2), (0, 0), False), + ((2, 3, 8, 8), None, (3, 3), (2, 2), (1, 1), False), + ((1, 64, 32, 32), None, (2, 2), (2, 2), (0, 0), False), + ((4, 128, 16, 16), None, (3, 3), (1, 1), (1, 1), False), + # 3D max pooling cases + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + # Cases with ceil_mode=True + ((1, 1, 7, 7), None, (3, 3), (2, 2), (1, 1), True), + ((1, 2, 5), None, (3,), (2,), (0,), True), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool(input_tensor, kernel_size, stride, padding, ceil_mode): + """ + Perform max pooling using PyTorch as reference + """ + ndim = len(input_tensor.shape) - 2 # 
Spatial dimensions + + if ndim == 1: + result = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + result = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + result = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + return result + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Compute reference result using PyTorch + torch_ref_output = max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ) + + # Use PyTorch输出shape来初始化output_tensor + output_tensor = TestTensor( + torch_ref_output.shape, None, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPool on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, kernel_size: {kernel_size}, " + f"stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + if sync is not None: + sync() + + # Create descriptor for our max pool operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_max_pool(): + check_error( + LIBINFINIOP.infiniopMaxPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_max_pool() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_max_pool(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # 
Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/scatter.py b/test/infiniop/scatter.py new file mode 100644 index 000000000..86ccdcdeb --- /dev/null +++ b/test/infiniop/scatter.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + +_TEST_CASES = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (7, 1), (1, 7), None), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (1, 2, 6), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, None, None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, [16, 8, 4, 1], None), +] + + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_scatter(output: torch.Tensor, input, index, dim): + output.scatter_(dim, index, src=input) + + +def test( + handle, + device, + input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing scatter on {InfiniDeviceNames[device]} with input_shape:{input_shape}, index_shape:{index_shape}, output_shape:{output_shape}, dim:{dim}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + + def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + index = torch.empty(index_shape, dtype=torch.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input.shape[0]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 2: + for i in range(input.shape[0]): + for j in range(input.shape[1]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, j, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 1: + for i in range(input.shape[0]): + for j in range(input.shape[2]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :, j] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 4 and scatter_dim == 0: + for i in range(input.shape[1]): + for j in range(input.shape[2]): + for k in range(input.shape[3]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[:, i, j, k] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + return index + + torch_index = get_test_index_tensor(input_shape, index_shape, output_shape, dim).type(torch.int64) + if index_strides: + torch_index = torch_index.as_strided(index_shape, index_strides) + index = 
TestTensor( + index_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_scatter(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateScatterDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetScatterWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_scatter(): + check_error( + LIBINFINIOP.infiniopScatter( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_scatter() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_scatter( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_scatter(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyScatterDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my scatter passed!\033[0m") From 05096eacf045078e0429415e065e244a69167350 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Tue, 20 Jan 2026 02:43:25 +0000 Subject: [PATCH 6/7] Issue/888 - Refactor: integrate exp and hardswish operators into unified unary framework. 
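With this change the per-operator boilerplate (descriptor create/calculate, dtype switch, device dispatch) is generated by shared unary-elementwise macros instead of being hand-written for each operator. As a rough sketch only, wiring up a hypothetical new unary operator (called "relu6" here purely for illustration; the exact macro arguments should be checked against the macro definitions added below) would look roughly like this:

    // include/infiniop/ops/unary_ops_api.h -- declare the public C API
    UNARY_OP_API_DECLARE(relu6, Relu6)

    // src/infiniop/elementwise/unary.h -- add UnaryMode::Relu6 and its math
    // in UnaryOp::operator() (host) and the CUDA counterpart

    // src/infiniop/ops/relu6/cpu/relu6_cpu.h -- bind the descriptor to the mode
    UNARY_ELEMENTWISE_DESCRIPTOR(relu6, cpu, op::elementwise::unary::UnaryMode::Relu6)

    // src/infiniop/ops/relu6/cpu/relu6_cpu.cc -- generate create()/calculate()
    namespace op::relu6::cpu {
    ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(relu6) // F16/F32/F64/BF16 dispatch
    } // namespace op::relu6::cpu

    // src/infiniop/ops/relu6/operator.cc -- generate the device dispatch
    UNARY_OP_IMPL(relu6, Relu6)

    // the nvidia/ backend mirrors the CPU side via
    // ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(relu6)

The exp and hardswish conversions in this patch follow exactly this layout.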
--- include/infiniop/ops/exp.h | 24 --- include/infiniop/ops/hardswish.h | 24 --- include/infiniop/ops/unary_ops_api.h | 2 + .../elementwise/cpu/elementwise_cpu_impl.h | 185 +++++++++------- .../nvidia/elementwise_nvidia_impl.cuh | 197 ++++++++++-------- src/infiniop/elementwise/unary.h | 47 +++++ src/infiniop/ops/exp/cpu/exp_cpu.cc | 48 +---- src/infiniop/ops/exp/cpu/exp_cpu.h | 16 +- src/infiniop/ops/exp/cuda/kernel.cuh | 33 +-- src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 53 +---- src/infiniop/ops/exp/operator.cc | 134 +----------- .../ops/hardswish/cpu/hardswish_cpu.cc | 48 +---- .../ops/hardswish/cpu/hardswish_cpu.h | 25 +-- src/infiniop/ops/hardswish/cuda/kernel.cuh | 49 +---- .../ops/hardswish/nvidia/hardswish_nvidia.cu | 53 +---- src/infiniop/ops/hardswish/operator.cc | 134 +----------- test/infiniop/exp.py | 165 --------------- test/infiniop/hardswish.py | 167 --------------- test/infiniop/test_all_unary_ops.py | 46 ++++ 19 files changed, 337 insertions(+), 1113 deletions(-) delete mode 100644 include/infiniop/ops/exp.h delete mode 100644 include/infiniop/ops/hardswish.h delete mode 100644 test/infiniop/exp.py delete mode 100644 test/infiniop/hardswish.py diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h deleted file mode 100644 index 624bc5363..000000000 --- a/include/infiniop/ops/exp.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __INFINIOP_EXP_API_H__ -#define __INFINIOP_EXP_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; - -__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, - infiniopExpDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); - -__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); - -__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h deleted file mode 100644 index 8d655fe82..000000000 --- a/include/infiniop/ops/hardswish.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __INFINIOP_HARDSWISH_API_H__ -#define __INFINIOP_HARDSWISH_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; - -__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, - infiniopHardswishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); - -__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); - -__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/unary_ops_api.h b/include/infiniop/ops/unary_ops_api.h index 95b0773b6..11cae2190 100644 --- a/include/infiniop/ops/unary_ops_api.h +++ b/include/infiniop/ops/unary_ops_api.h @@ -35,5 +35,7 @@ UNARY_OP_API_DECLARE(erf, Erf) UNARY_OP_API_DECLARE(atan, Atan) UNARY_OP_API_DECLARE(acos, Acos) UNARY_OP_API_DECLARE(ceil, Ceil) +UNARY_OP_API_DECLARE(exp, Exp) +UNARY_OP_API_DECLARE(hardswish, Hardswish) #endif // 
__INFINIOP_UNARY_OPS_API_H__ diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h index 030f4d87e..fff5b1819 100644 --- a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h +++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h @@ -25,8 +25,74 @@ * } */ +// ========================================================================= +// Internal Helpers (Private Macros to reduce duplication) +// ========================================================================= + +/** + * @brief Common Calculate Switch Cases (F16 & F32) + */ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate(_info, output, inputs, stream); + /** - * @brief Macro to generate binary operator implementation. + * @brief Extended Calculate Switch Cases (Adds F64 & BF16) + */ +#define _IMPL_CALC_CASES_EXTENDED \ + _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F64: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->template calculate(_info, output, inputs, stream); + +/** + * @brief Generic Template for the Calculate method + * @param CASES_MACRO The macro containing the switch cases to use + */ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Generic Template for the Create method + * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking + * @param ... Variadic arguments for allowed data types in CHECK_DTYPE + */ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } + +// ========================================================================= +// Public API Implementation Macros +// ========================================================================= + +/** + * @brief Implementation for Binary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for binary operators, using the generic implementation. 
@@ -37,48 +103,19 @@ * ELEMENTWISE_CPU_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &out_shape = out_desc->shape(); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); \ - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** - * @brief Macro to generate unary operator implementation. + * @brief Implementation for Unary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for unary operators, using the generic implementation. 
@@ -89,42 +126,34 @@ * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &y_shape = out_desc->shape(); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(y_shape, x_shape); \ - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) + +/** + * @brief Implementation for Unary Operators Extended (F16, F32, F64, BF16) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators supporting F16, F32, F64, and BF16 data types. + * + * Usage: + * namespace op::exp::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp) + * } + */ +#define ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) #endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh index 39b78884a..b0716db1a 100644 --- a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -25,8 +25,81 @@ * } */ +// ========================================================================= +// Internal Helpers (Private Macros to reduce duplication) +// ========================================================================= + +/** + * @brief Common Calculate Switch Cases (F16 & F32) + */ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); + /** - * @brief Macro to generate binary operator implementation for NVIDIA/CUDA. 
+ * @brief Extended Calculate Switch Cases (Adds F64 & BF16) + * Note: Order is F16, BF16, F32, F64 to match original implementation + */ +#define _IMPL_CALC_CASES_EXTENDED \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::Op, double>(_info, workspace, output, inputs, stream); + +/** + * @brief Generic Template for the Calculate method + * @param CASES_MACRO The macro containing the switch cases to use + */ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Generic Template for the Create method + * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking + * @param ... Variadic arguments for allowed data types in CHECK_DTYPE + */ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } + +// ========================================================================= +// Public API Implementation Macros +// ========================================================================= + +/** + * @brief Implementation for Binary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for binary operators, using the generic implementation. 
@@ -36,51 +109,19 @@ * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &c_shape = out_desc->shape(); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); \ - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - if (workspace_size < _workspace_size) { \ - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ - } \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->calculate<256, cuda::Op, half>( \ - _info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->calculate<256, cuda::Op, float>( \ - _info, workspace, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** - * @brief Macro to generate unary operator implementation for NVIDIA/CUDA. + * @brief Implementation for Unary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for unary operators, using the generic implementation. 
@@ -90,45 +131,33 @@ * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &y_shape = out_desc->shape(); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(y_shape, x_shape); \ - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - if (workspace_size < _workspace_size) { \ - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ - } \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->calculate<256, cuda::Op, half>( \ - _info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->calculate<256, cuda::Op, float>( \ - _info, workspace, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) + +/** + * @brief Implementation for Unary Operators Extended (F16, F32, F64, BF16) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators supporting F16, F32, F64, and BF16 data types. + * + * Usage: + * namespace op::exp::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) #endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h index 9f41dedb2..330f305dd 100644 --- a/src/infiniop/elementwise/unary.h +++ b/src/infiniop/elementwise/unary.h @@ -48,6 +48,7 @@ enum class UnaryMode { Sigmoid, Sign, Erf, + Hardswish, }; /** @@ -124,6 +125,17 @@ struct UnaryOp { return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); } else if constexpr (Mode == UnaryMode::Erf) { return std::erf(x); + } else if constexpr (Mode == UnaryMode::Hardswish) { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x_val = static_cast(x); + double y = x_val + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x_val * (y / 6.0); + return static_cast(out); + } } else { static_assert(Mode != Mode, "Unsupported unary operation mode"); return x; @@ -487,6 +499,41 @@ struct UnaryOp { } else { return std::erf(x); } + } else if constexpr (Mode == UnaryMode::Hardswish) { + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + auto hswish_f32 = [](float x) -> float { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + }; + if constexpr (std::is_same_v) { + float2 vf = __half22float2(x); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(x); + } else if constexpr (std::is_same_v) { + double xd = static_cast(x); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(x); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } } else { static_assert(Mode != Mode, "Unsupported unary operation mode"); return x; diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc index 58a6d0f2d..fb254884f 100644 --- a/src/infiniop/ops/exp/cpu/exp_cpu.cc +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -1,52 +1,8 @@ #include "exp_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::exp::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h index 867c7afa5..c247651a5 100644 --- a/src/infiniop/ops/exp/cpu/exp_cpu.h +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -2,20 +2,8 @@ #define __EXP_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" -#include +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(exp, cpu) - -namespace op::exp::cpu { -typedef struct ExpOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &input) const { - return std::exp(input); - } -} ExpOp; -} // namespace op::exp::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(exp, cpu, 
op::elementwise::unary::UnaryMode::Exp) #endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh index 12446f31a..8ef5375b8 100644 --- a/src/infiniop/ops/exp/cuda/kernel.cuh +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -1,39 +1,10 @@ #ifndef __EXP_CUDA_H__ #define __EXP_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/unary.h" namespace op::exp::cuda { -typedef struct ExpOp { - static constexpr size_t num_inputs = 1; - - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float inputf = __half2float(input); - return __float2half_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(__expf(f0), __expf(f1)); - } else if constexpr (std::is_same_v) { - float inputf = __bfloat162float(input); - return __float2bfloat16_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - return __expf(input); - } else if constexpr (std::is_same_v) { - return std::exp(input); - } else { - return std::exp(input); - } - } -} ExpOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::exp::cuda #endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu index 3bdf2eb45..532f3a0d7 100644 --- a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -1,59 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "exp_nvidia.cuh" namespace op::exp::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - 
- return INFINI_STATUS_SUCCESS; -} } // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc index cc369d660..9a7aaecc5 100644 --- a/src/infiniop/ops/exp/operator.cc +++ b/src/infiniop/ops/exp/operator.cc @@ -1,6 +1,5 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/exp.h" +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/exp_cpu.h" @@ -9,131 +8,4 @@ #include "nvidia/exp_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateExpDescriptor( - infiniopHandle_t handle, - infiniopExpDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::exp::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopExp( - infiniopExpDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, {input}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(exp, Exp) diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc index e7b68508a..0d5b8577c 100644 --- 
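The `UNARY_OP_IMPL(exp, Exp)` line above collapses the per-operator create / get-workspace / calculate / destroy boilerplate that the removed `operator.cc` spelled out by hand, presumably via a macro provided by the `../../operator_impl.h` header included at the top of the file (its definition is not part of this hunk). The sketch below is only an illustrative, self-contained analogue of that dispatch pattern, reconstructed from the removed switch statements; `Device`, `Status`, and `UNARY_OP_IMPL_SKETCH` are simplified placeholders, not the actual InfiniOP interfaces.

```cpp
// Illustrative analogue only: a self-contained sketch of macro-generated
// per-operator dispatch, modeled on the removed infiniopCreateExpDescriptor /
// infiniopExp switch statements above.
#include <cmath>
#include <cstdio>

enum Device { DEVICE_CPU, DEVICE_NVIDIA };
enum Status { STATUS_SUCCESS, STATUS_DEVICE_TYPE_NOT_SUPPORTED };

namespace op::exp::cpu    { inline Status calculate(float x, float *y) { *y = std::exp(x); return STATUS_SUCCESS; } }
namespace op::exp::nvidia { inline Status calculate(float x, float *y) { *y = std::exp(x); return STATUS_SUCCESS; } }

// The real UNARY_OP_IMPL presumably emits the full create/get-workspace/
// calculate/destroy quartet; this sketch emits a single function to show the
// device switch that such a macro centralizes.
#define UNARY_OP_IMPL_SKETCH(name)                                        \
    Status infiniop_##name(Device dev, float in, float *out) {            \
        switch (dev) {                                                     \
        case DEVICE_CPU:    return op::name::cpu::calculate(in, out);     \
        case DEVICE_NVIDIA: return op::name::nvidia::calculate(in, out);  \
        default:            return STATUS_DEVICE_TYPE_NOT_SUPPORTED;      \
        }                                                                  \
    }

UNARY_OP_IMPL_SKETCH(exp)

int main() {
    float y = 0.0f;
    Status s = infiniop_exp(DEVICE_CPU, 1.0f, &y);
    std::printf("status=%d y=%f\n", static_cast<int>(s), y);
    return 0;
}
```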
a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -1,52 +1,8 @@ #include "hardswish_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::hardswish::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(hardswish) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h index e137be8a0..de1a78f65 100644 --- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -2,29 +2,8 @@ #define __HARDSWISH_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" -#include +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(hardswish, cpu) - -namespace op::hardswish::cpu { -typedef struct HardswishOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &input) const { - if constexpr (std::is_integral_v) { - return static_cast(0); - } else { - // x * clamp(x + 3, 0, 6) / 6 - auto x = static_cast(input); - double y = x + 3.0; - y = std::min(std::max(y, 0.0), 6.0); - double out = x * (y / 6.0); - return static_cast(out); - } - } -} HardswishOp; -} // namespace op::hardswish::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(hardswish, cpu, op::elementwise::unary::UnaryMode::Hardswish) #endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh index d5b369bce..95377b75c 100644 --- a/src/infiniop/ops/hardswish/cuda/kernel.cuh +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -1,55 +1,10 @@ #ifndef __HARDSWISH_CUDA_H__ #define __HARDSWISH_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/unary.h" namespace op::hardswish::cuda { - -typedef struct HardswishOp { - static constexpr size_t num_inputs = 1; - - // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 - __device__ __forceinline__ float hswish_f32(float x) const { - float y = x + 3.0f; - y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); - return x * (y * (1.0f / 6.0f)); - } - - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2( - hswish_f32(vf.x), - hswish_f32(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = hswish_f32(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - return __float2bfloat16_rz(hswish_f32(xf)); - } else if constexpr (std::is_same_v) { - return hswish_f32(input); - } else if constexpr (std::is_same_v) { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); - } else { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); - } - } -} HardswishOp; - +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::hardswish::cuda #endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu index 9e279c2ef..ebf5250f7 100644 --- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -1,59 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "hardswish_nvidia.cuh" namespace op::hardswish::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(hardswish) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::hardswish::nvidia diff --git 
a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc index c51b18777..4cf68d328 100644 --- a/src/infiniop/ops/hardswish/operator.cc +++ b/src/infiniop/ops/hardswish/operator.cc @@ -1,6 +1,5 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/hardswish.h" +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/hardswish_cpu.h" @@ -9,131 +8,4 @@ #include "nvidia/hardswish_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateHardswishDescriptor( - infiniopHandle_t handle, - infiniopHardswishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::hardswish::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopHardswish( - infiniopHardswishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, {input}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(hardswish, Hardswish) diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py deleted file mode 100644 index eb139af12..000000000 --- a/test/infiniop/exp.py +++ /dev/null @@ -1,165 
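For reference, both hardswish functors removed above compute f(x) = x * clamp(x + 3, 0, 6) / 6, which is also the formula the deleted `hardswish.py` test compares against. The following is a minimal standalone sketch of that formula at a few easy-to-verify points, independent of the library and using only the C++ standard library; `hardswish` here is a local helper, not an InfiniOP symbol.

```cpp
// Standalone sanity check of the hardswish formula used by the removed
// functors and the deleted Python test: f(x) = x * clamp(x + 3, 0, 6) / 6.
#include <algorithm>
#include <cassert>
#include <cstdio>

static double hardswish(double x) {
    double y = std::min(std::max(x + 3.0, 0.0), 6.0); // clamp(x + 3, 0, 6)
    return x * (y / 6.0);
}

int main() {
    // Below -3 the clamp saturates at 0, above +3 it saturates at 6,
    // so the function is 0 on the left tail and the identity on the right tail.
    assert(hardswish(-4.0) == 0.0);
    assert(hardswish(-3.0) == 0.0);
    assert(hardswish(0.0) == 0.0);
    assert(hardswish(3.0) == 3.0);
    assert(hardswish(4.0) == 4.0);
    std::printf("hardswish(1.0) = %f\n", hardswish(1.0)); // 1 * (4/6) = 0.666...
    return 0;
}
```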
+0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - get_sync_func, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ======================================================================== -# Configuration (Internal Use Only) -# ======================================================================== -_TEST_CASES_ = [ - # shape, input_stride, output_stride - ((13, 4), None, None), - ((13, 4), (10, 1), (10, 1)), - ((13, 4), (0, 1), None), - ((13, 4, 4), None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), None), - ((16, 5632), None, None), - ((16, 5632), (10240, 1), (10240, 1)), - ((4, 4, 5632), None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), -] - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_INPUT = auto() - -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_INPUT, -] - -_TEST_CASES = [ - test_case + (inplace,) - for test_case in _TEST_CASES_ - for inplace in _INPLACE -] - -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] - -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, - InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def exp(output, input): - output.copy_(torch.exp(input)) - -def test( - handle, - device, - shape, - input_stride=None, - output_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=torch.float16, - sync=None, -): - input = TestTensor(shape, input_stride, dtype, device) - if inplace == Inplace.INPLACE_INPUT: - if input_stride != output_stride: - return - output = input - else: - output = TestTensor(shape, output_stride, dtype, device, mode="ones") - - if output.is_broadcast(): - return - - print( - f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - exp(output.torch_tensor(), input.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateExpDescriptor( - handle, - ctypes.byref(descriptor), - output.descriptor, - input.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [input, output]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetExpWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, output.device) - - def lib_exp(): - check_error( - LIBINFINIOP.infiniopExp( - descriptor, - workspace.data(), - workspace_size.value, - output.data(), - input.data(), - None, - ) - ) - - lib_exp() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, 
NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py deleted file mode 100644 index 424b30567..000000000 --- a/test/infiniop/hardswish.py +++ /dev/null @@ -1,167 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - get_sync_func, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ======================================================================== -# Configuration (Internal Use Only) -# ======================================================================== -_TEST_CASES_ = [ - # shape, input_stride, output_stride - ((13, 4), None, None), - ((13, 4), (10, 1), (10, 1)), - ((13, 4), (0, 1), None), - ((13, 4, 4), None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), None), - ((16, 5632), None, None), - ((16, 5632), (10240, 1), (10240, 1)), - ((4, 4, 5632), None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), -] - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_INPUT = auto() - -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_INPUT, -] - -_TEST_CASES = [ - test_case + (inplace,) - for test_case in _TEST_CASES_ - for inplace in _INPLACE -] - -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] - -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, - InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def hardswish(output, input): - output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) - - -def test( - handle, - device, - shape, - input_stride=None, - output_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=torch.float16, - sync=None, -): - input = TestTensor(shape, input_stride, dtype, device) - if inplace == Inplace.INPLACE_INPUT: - if input_stride != output_stride: - return - output = input - else: - output = TestTensor(shape, output_stride, dtype, device, mode="ones") - - if output.is_broadcast(): - return - - print( - f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - hardswish(output.torch_tensor(), input.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateHardswishDescriptor( - handle, - ctypes.byref(descriptor), - output.descriptor, - input.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [input, output]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetHardswishWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, 
output.device) - - def lib_hardswish(): - check_error( - LIBINFINIOP.infiniopHardswish( - descriptor, - workspace.data(), - workspace_size.value, - output.data(), - input.data(), - None, - ) - ) - - lib_hardswish() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/test_all_unary_ops.py b/test/infiniop/test_all_unary_ops.py index b9d7cdc8b..2a65cf938 100644 --- a/test/infiniop/test_all_unary_ops.py +++ b/test/infiniop/test_all_unary_ops.py @@ -430,6 +430,50 @@ def generate_input(shape, dtype, device): EQUAL_NAN = True +class ExpTest(UnaryTestBase): + OP_NAME = "Exp" + OP_NAME_LOWER = "exp" + + @staticmethod + def torch_op(x): + return torch.exp(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + +class HardswishTest(UnaryTestBase): + OP_NAME = "Hardswish" + OP_NAME_LOWER = "hardswish" + + @staticmethod + def torch_op(x): + return (x * torch.clamp(x + 3, min=0, max=6) / 6).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + # ============================================================================== # 算子注册表 # ============================================================================== @@ -456,6 +500,8 @@ def generate_input(shape, dtype, device): "sinh": SinhTest, "sqrt": SqrtTest, "tan": TanTest, + "exp": ExpTest, + "hardswish": HardswishTest, } From 6d475877b8766c62132fd492c462a7b8c1314802 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Wed, 21 Jan 2026 05:26:22 +0000 Subject: [PATCH 7/7] Issue/888 - Add gt,lt,ge,le,eq,ne,logical_and,logical_or,logical_xor,sin,bitwise_and, bitwise_or, bitwise_xor, bitwise_left_shift, bitwise_right_shift,floor_divide,atan2,exp2,log2,log10,rsqrt,square,hypot,copysign,remainder,isnan,isfinite,isinf,sinc,fmin,fmax,log1p binary operators. 
--- include/infiniop.h | 8 + include/infiniop/ops/averagepool.h | 8 +- include/infiniop/ops/binary_ops_api.h | 21 + include/infiniop/ops/cross_entropy_loss.h | 8 +- include/infiniop/ops/interpolate_nearest.h | 8 +- include/infiniop/ops/maxpool.h | 8 +- include/infiniop/ops/unary_ops_api.h | 11 + src/infiniop/README.md | 208 ++++ src/infiniop/elementwise/binary.h | 544 +++++++- .../elementwise/cpu/elementwise_cpu_impl.h | 141 ++- .../nvidia/elementwise_nvidia_impl.cuh | 150 ++- src/infiniop/elementwise/unary.h | 354 +++++- src/infiniop/ops/atan2/cpu/atan2_cpu.cc | 8 + src/infiniop/ops/atan2/cpu/atan2_cpu.h | 9 + src/infiniop/ops/atan2/cuda/kernel.cuh | 10 + src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu | 10 + .../ops/atan2/nvidia/atan2_nvidia.cuh | 8 + src/infiniop/ops/atan2/operator.cc | 11 + .../ops/bitwise_and/cpu/bitwise_and_cpu.cc | 8 + .../ops/bitwise_and/cpu/bitwise_and_cpu.h | 9 + src/infiniop/ops/bitwise_and/cuda/kernel.cuh | 10 + .../bitwise_and/nvidia/bitwise_and_nvidia.cu | 10 + .../bitwise_and/nvidia/bitwise_and_nvidia.cuh | 8 + src/infiniop/ops/bitwise_and/operator.cc | 11 + .../cpu/bitwise_left_shift_cpu.cc | 8 + .../cpu/bitwise_left_shift_cpu.h | 9 + .../ops/bitwise_left_shift/cuda/kernel.cuh | 10 + .../nvidia/bitwise_left_shift_nvidia.cu | 10 + .../nvidia/bitwise_left_shift_nvidia.cuh | 8 + .../ops/bitwise_left_shift/operator.cc | 11 + .../ops/bitwise_or/cpu/bitwise_or_cpu.cc | 8 + .../ops/bitwise_or/cpu/bitwise_or_cpu.h | 9 + src/infiniop/ops/bitwise_or/cuda/kernel.cuh | 10 + .../bitwise_or/nvidia/bitwise_or_nvidia.cu | 10 + .../bitwise_or/nvidia/bitwise_or_nvidia.cuh | 8 + src/infiniop/ops/bitwise_or/operator.cc | 11 + .../cpu/bitwise_right_shift_cpu.cc | 8 + .../cpu/bitwise_right_shift_cpu.h | 9 + .../ops/bitwise_right_shift/cuda/kernel.cuh | 10 + .../nvidia/bitwise_right_shift_nvidia.cu | 10 + .../nvidia/bitwise_right_shift_nvidia.cuh | 8 + .../ops/bitwise_right_shift/operator.cc | 11 + .../ops/bitwise_xor/cpu/bitwise_xor_cpu.cc | 8 + .../ops/bitwise_xor/cpu/bitwise_xor_cpu.h | 9 + src/infiniop/ops/bitwise_xor/cuda/kernel.cuh | 10 + .../bitwise_xor/nvidia/bitwise_xor_nvidia.cu | 10 + .../bitwise_xor/nvidia/bitwise_xor_nvidia.cuh | 8 + src/infiniop/ops/bitwise_xor/operator.cc | 11 + src/infiniop/ops/copysign/cpu/copysign_cpu.cc | 8 + src/infiniop/ops/copysign/cpu/copysign_cpu.h | 9 + src/infiniop/ops/copysign/cuda/kernel.cuh | 10 + .../ops/copysign/nvidia/copysign_nvidia.cu | 10 + .../ops/copysign/nvidia/copysign_nvidia.cuh | 8 + src/infiniop/ops/copysign/operator.cc | 11 + src/infiniop/ops/eq/cpu/eq_cpu.cc | 8 + src/infiniop/ops/eq/cpu/eq_cpu.h | 9 + src/infiniop/ops/eq/cuda/kernel.cuh | 10 + src/infiniop/ops/eq/nvidia/eq_nvidia.cu | 10 + src/infiniop/ops/eq/nvidia/eq_nvidia.cuh | 8 + src/infiniop/ops/eq/operator.cc | 11 + src/infiniop/ops/exp2/cpu/exp2_cpu.cc | 8 + src/infiniop/ops/exp2/cpu/exp2_cpu.h | 9 + src/infiniop/ops/exp2/cuda/kernel.cuh | 10 + src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu | 10 + src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh | 8 + src/infiniop/ops/exp2/operator.cc | 11 + .../ops/floor_divide/cpu/floor_divide_cpu.cc | 8 + .../ops/floor_divide/cpu/floor_divide_cpu.h | 9 + src/infiniop/ops/floor_divide/cuda/kernel.cuh | 10 + .../nvidia/floor_divide_nvidia.cu | 10 + .../nvidia/floor_divide_nvidia.cuh | 8 + src/infiniop/ops/floor_divide/operator.cc | 11 + src/infiniop/ops/fmax/cpu/fmax_cpu.cc | 8 + src/infiniop/ops/fmax/cpu/fmax_cpu.h | 9 + src/infiniop/ops/fmax/cuda/kernel.cuh | 10 + src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu | 10 + 
src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh | 8 + src/infiniop/ops/fmax/operator.cc | 11 + src/infiniop/ops/fmin/cpu/fmin_cpu.cc | 8 + src/infiniop/ops/fmin/cpu/fmin_cpu.h | 9 + src/infiniop/ops/fmin/cuda/kernel.cuh | 10 + src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu | 10 + src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh | 8 + src/infiniop/ops/fmin/operator.cc | 11 + src/infiniop/ops/ge/cpu/ge_cpu.cc | 8 + src/infiniop/ops/ge/cpu/ge_cpu.h | 9 + src/infiniop/ops/ge/cuda/kernel.cuh | 10 + src/infiniop/ops/ge/nvidia/ge_nvidia.cu | 10 + src/infiniop/ops/ge/nvidia/ge_nvidia.cuh | 8 + src/infiniop/ops/ge/operator.cc | 11 + src/infiniop/ops/gt/cpu/gt_cpu.cc | 8 + src/infiniop/ops/gt/cpu/gt_cpu.h | 9 + src/infiniop/ops/gt/cuda/kernel.cuh | 10 + src/infiniop/ops/gt/nvidia/gt_nvidia.cu | 10 + src/infiniop/ops/gt/nvidia/gt_nvidia.cuh | 8 + src/infiniop/ops/gt/operator.cc | 11 + src/infiniop/ops/hypot/cpu/hypot_cpu.cc | 8 + src/infiniop/ops/hypot/cpu/hypot_cpu.h | 9 + src/infiniop/ops/hypot/cuda/kernel.cuh | 10 + src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu | 10 + .../ops/hypot/nvidia/hypot_nvidia.cuh | 8 + src/infiniop/ops/hypot/operator.cc | 11 + src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc | 8 + src/infiniop/ops/isfinite/cpu/isfinite_cpu.h | 9 + src/infiniop/ops/isfinite/cuda/kernel.cuh | 10 + .../ops/isfinite/nvidia/isfinite_nvidia.cu | 10 + .../ops/isfinite/nvidia/isfinite_nvidia.cuh | 8 + src/infiniop/ops/isfinite/operator.cc | 11 + src/infiniop/ops/isinf/cpu/isinf_cpu.cc | 8 + src/infiniop/ops/isinf/cpu/isinf_cpu.h | 9 + src/infiniop/ops/isinf/cuda/kernel.cuh | 10 + src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu | 10 + .../ops/isinf/nvidia/isinf_nvidia.cuh | 8 + src/infiniop/ops/isinf/operator.cc | 11 + src/infiniop/ops/isnan/cpu/isnan_cpu.cc | 8 + src/infiniop/ops/isnan/cpu/isnan_cpu.h | 9 + src/infiniop/ops/isnan/cuda/kernel.cuh | 10 + src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu | 10 + .../ops/isnan/nvidia/isnan_nvidia.cuh | 8 + src/infiniop/ops/isnan/operator.cc | 11 + src/infiniop/ops/le/cpu/le_cpu.cc | 8 + src/infiniop/ops/le/cpu/le_cpu.h | 9 + src/infiniop/ops/le/cuda/kernel.cuh | 10 + src/infiniop/ops/le/nvidia/le_nvidia.cu | 10 + src/infiniop/ops/le/nvidia/le_nvidia.cuh | 8 + src/infiniop/ops/le/operator.cc | 11 + src/infiniop/ops/log10/cpu/log10_cpu.cc | 8 + src/infiniop/ops/log10/cpu/log10_cpu.h | 9 + src/infiniop/ops/log10/cuda/kernel.cuh | 10 + src/infiniop/ops/log10/nvidia/log10_nvidia.cu | 10 + .../ops/log10/nvidia/log10_nvidia.cuh | 8 + src/infiniop/ops/log10/operator.cc | 11 + src/infiniop/ops/log1p/cpu/log1p_cpu.cc | 8 + src/infiniop/ops/log1p/cpu/log1p_cpu.h | 9 + src/infiniop/ops/log1p/cuda/kernel.cuh | 10 + src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu | 10 + .../ops/log1p/nvidia/log1p_nvidia.cuh | 8 + src/infiniop/ops/log1p/operator.cc | 11 + src/infiniop/ops/log2/cpu/log2_cpu.cc | 8 + src/infiniop/ops/log2/cpu/log2_cpu.h | 9 + src/infiniop/ops/log2/cuda/kernel.cuh | 10 + src/infiniop/ops/log2/nvidia/log2_nvidia.cu | 10 + src/infiniop/ops/log2/nvidia/log2_nvidia.cuh | 8 + src/infiniop/ops/log2/operator.cc | 11 + .../ops/logical_and/cpu/logical_and_cpu.cc | 8 + .../ops/logical_and/cpu/logical_and_cpu.h | 9 + src/infiniop/ops/logical_and/cuda/kernel.cuh | 10 + .../logical_and/nvidia/logical_and_nvidia.cu | 10 + .../logical_and/nvidia/logical_and_nvidia.cuh | 8 + src/infiniop/ops/logical_and/operator.cc | 11 + .../ops/logical_or/cpu/logical_or_cpu.cc | 8 + .../ops/logical_or/cpu/logical_or_cpu.h | 9 + src/infiniop/ops/logical_or/cuda/kernel.cuh | 10 + 
.../logical_or/nvidia/logical_or_nvidia.cu | 10 + .../logical_or/nvidia/logical_or_nvidia.cuh | 8 + src/infiniop/ops/logical_or/operator.cc | 11 + .../ops/logical_xor/cpu/logical_xor_cpu.cc | 8 + .../ops/logical_xor/cpu/logical_xor_cpu.h | 9 + src/infiniop/ops/logical_xor/cuda/kernel.cuh | 10 + .../logical_xor/nvidia/logical_xor_nvidia.cu | 10 + .../logical_xor/nvidia/logical_xor_nvidia.cuh | 8 + src/infiniop/ops/logical_xor/operator.cc | 11 + src/infiniop/ops/lt/cpu/lt_cpu.cc | 8 + src/infiniop/ops/lt/cpu/lt_cpu.h | 9 + src/infiniop/ops/lt/cuda/kernel.cuh | 10 + src/infiniop/ops/lt/nvidia/lt_nvidia.cu | 10 + src/infiniop/ops/lt/nvidia/lt_nvidia.cuh | 8 + src/infiniop/ops/lt/operator.cc | 11 + src/infiniop/ops/ne/cpu/ne_cpu.cc | 8 + src/infiniop/ops/ne/cpu/ne_cpu.h | 9 + src/infiniop/ops/ne/cuda/kernel.cuh | 10 + src/infiniop/ops/ne/nvidia/ne_nvidia.cu | 10 + src/infiniop/ops/ne/nvidia/ne_nvidia.cuh | 8 + src/infiniop/ops/ne/operator.cc | 11 + .../ops/remainder/cpu/remainder_cpu.cc | 8 + .../ops/remainder/cpu/remainder_cpu.h | 9 + src/infiniop/ops/remainder/cuda/kernel.cuh | 10 + .../ops/remainder/nvidia/remainder_nvidia.cu | 10 + .../ops/remainder/nvidia/remainder_nvidia.cuh | 8 + src/infiniop/ops/remainder/operator.cc | 11 + src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc | 8 + src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h | 9 + src/infiniop/ops/rsqrt/cuda/kernel.cuh | 10 + src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu | 10 + .../ops/rsqrt/nvidia/rsqrt_nvidia.cuh | 8 + src/infiniop/ops/rsqrt/operator.cc | 11 + src/infiniop/ops/sin/cpu/sin_cpu.cc | 8 + src/infiniop/ops/sin/cpu/sin_cpu.h | 9 + src/infiniop/ops/sin/cuda/kernel.cuh | 10 + src/infiniop/ops/sin/nvidia/sin_nvidia.cu | 10 + src/infiniop/ops/sin/nvidia/sin_nvidia.cuh | 8 + src/infiniop/ops/sin/operator.cc | 11 + src/infiniop/ops/sinc/cpu/sinc_cpu.cc | 8 + src/infiniop/ops/sinc/cpu/sinc_cpu.h | 9 + src/infiniop/ops/sinc/cuda/kernel.cuh | 10 + src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu | 10 + src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh | 8 + src/infiniop/ops/sinc/operator.cc | 11 + src/infiniop/ops/square/cpu/square_cpu.cc | 8 + src/infiniop/ops/square/cpu/square_cpu.h | 9 + src/infiniop/ops/square/cuda/kernel.cuh | 10 + .../ops/square/nvidia/square_nvidia.cu | 10 + .../ops/square/nvidia/square_nvidia.cuh | 8 + src/infiniop/ops/square/operator.cc | 11 + test/infiniop/libinfiniop/op_register.py | 1091 ++++++++++++++++- test/infiniop/libinfiniop/utils.py | 42 +- test/infiniop/test_all_binary_ops.py | 606 +++++++++ test/infiniop/test_all_unary_ops.py | 279 +++++ 208 files changed, 5093 insertions(+), 186 deletions(-) create mode 100644 src/infiniop/ops/atan2/cpu/atan2_cpu.cc create mode 100644 src/infiniop/ops/atan2/cpu/atan2_cpu.h create mode 100644 src/infiniop/ops/atan2/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu create mode 100644 src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh create mode 100644 src/infiniop/ops/atan2/operator.cc create mode 100644 src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc create mode 100644 src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h create mode 100644 src/infiniop/ops/bitwise_and/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_and/operator.cc create mode 100644 src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc create mode 100644 src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h 
create mode 100644 src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_left_shift/operator.cc create mode 100644 src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc create mode 100644 src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h create mode 100644 src/infiniop/ops/bitwise_or/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_or/operator.cc create mode 100644 src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc create mode 100644 src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h create mode 100644 src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_right_shift/operator.cc create mode 100644 src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc create mode 100644 src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h create mode 100644 src/infiniop/ops/bitwise_xor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_xor/operator.cc create mode 100644 src/infiniop/ops/copysign/cpu/copysign_cpu.cc create mode 100644 src/infiniop/ops/copysign/cpu/copysign_cpu.h create mode 100644 src/infiniop/ops/copysign/cuda/kernel.cuh create mode 100644 src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu create mode 100644 src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh create mode 100644 src/infiniop/ops/copysign/operator.cc create mode 100644 src/infiniop/ops/eq/cpu/eq_cpu.cc create mode 100644 src/infiniop/ops/eq/cpu/eq_cpu.h create mode 100644 src/infiniop/ops/eq/cuda/kernel.cuh create mode 100644 src/infiniop/ops/eq/nvidia/eq_nvidia.cu create mode 100644 src/infiniop/ops/eq/nvidia/eq_nvidia.cuh create mode 100644 src/infiniop/ops/eq/operator.cc create mode 100644 src/infiniop/ops/exp2/cpu/exp2_cpu.cc create mode 100644 src/infiniop/ops/exp2/cpu/exp2_cpu.h create mode 100644 src/infiniop/ops/exp2/cuda/kernel.cuh create mode 100644 src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu create mode 100644 src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh create mode 100644 src/infiniop/ops/exp2/operator.cc create mode 100644 src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc create mode 100644 src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h create mode 100644 src/infiniop/ops/floor_divide/cuda/kernel.cuh create mode 100644 src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu create mode 100644 src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh create mode 100644 src/infiniop/ops/floor_divide/operator.cc create mode 100644 src/infiniop/ops/fmax/cpu/fmax_cpu.cc create mode 100644 src/infiniop/ops/fmax/cpu/fmax_cpu.h create mode 100644 src/infiniop/ops/fmax/cuda/kernel.cuh create mode 100644 src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu create mode 100644 src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh create mode 100644 src/infiniop/ops/fmax/operator.cc create mode 100644 
src/infiniop/ops/fmin/cpu/fmin_cpu.cc create mode 100644 src/infiniop/ops/fmin/cpu/fmin_cpu.h create mode 100644 src/infiniop/ops/fmin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu create mode 100644 src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh create mode 100644 src/infiniop/ops/fmin/operator.cc create mode 100644 src/infiniop/ops/ge/cpu/ge_cpu.cc create mode 100644 src/infiniop/ops/ge/cpu/ge_cpu.h create mode 100644 src/infiniop/ops/ge/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ge/nvidia/ge_nvidia.cu create mode 100644 src/infiniop/ops/ge/nvidia/ge_nvidia.cuh create mode 100644 src/infiniop/ops/ge/operator.cc create mode 100644 src/infiniop/ops/gt/cpu/gt_cpu.cc create mode 100644 src/infiniop/ops/gt/cpu/gt_cpu.h create mode 100644 src/infiniop/ops/gt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/gt/nvidia/gt_nvidia.cu create mode 100644 src/infiniop/ops/gt/nvidia/gt_nvidia.cuh create mode 100644 src/infiniop/ops/gt/operator.cc create mode 100644 src/infiniop/ops/hypot/cpu/hypot_cpu.cc create mode 100644 src/infiniop/ops/hypot/cpu/hypot_cpu.h create mode 100644 src/infiniop/ops/hypot/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu create mode 100644 src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh create mode 100644 src/infiniop/ops/hypot/operator.cc create mode 100644 src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc create mode 100644 src/infiniop/ops/isfinite/cpu/isfinite_cpu.h create mode 100644 src/infiniop/ops/isfinite/cuda/kernel.cuh create mode 100644 src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu create mode 100644 src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh create mode 100644 src/infiniop/ops/isfinite/operator.cc create mode 100644 src/infiniop/ops/isinf/cpu/isinf_cpu.cc create mode 100644 src/infiniop/ops/isinf/cpu/isinf_cpu.h create mode 100644 src/infiniop/ops/isinf/cuda/kernel.cuh create mode 100644 src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu create mode 100644 src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh create mode 100644 src/infiniop/ops/isinf/operator.cc create mode 100644 src/infiniop/ops/isnan/cpu/isnan_cpu.cc create mode 100644 src/infiniop/ops/isnan/cpu/isnan_cpu.h create mode 100644 src/infiniop/ops/isnan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu create mode 100644 src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh create mode 100644 src/infiniop/ops/isnan/operator.cc create mode 100644 src/infiniop/ops/le/cpu/le_cpu.cc create mode 100644 src/infiniop/ops/le/cpu/le_cpu.h create mode 100644 src/infiniop/ops/le/cuda/kernel.cuh create mode 100644 src/infiniop/ops/le/nvidia/le_nvidia.cu create mode 100644 src/infiniop/ops/le/nvidia/le_nvidia.cuh create mode 100644 src/infiniop/ops/le/operator.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.h create mode 100644 src/infiniop/ops/log10/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cu create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cuh create mode 100644 src/infiniop/ops/log10/operator.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.h create mode 100644 src/infiniop/ops/log1p/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh create mode 100644 src/infiniop/ops/log1p/operator.cc create mode 100644 
src/infiniop/ops/log2/cpu/log2_cpu.cc create mode 100644 src/infiniop/ops/log2/cpu/log2_cpu.h create mode 100644 src/infiniop/ops/log2/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log2/nvidia/log2_nvidia.cu create mode 100644 src/infiniop/ops/log2/nvidia/log2_nvidia.cuh create mode 100644 src/infiniop/ops/log2/operator.cc create mode 100644 src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc create mode 100644 src/infiniop/ops/logical_and/cpu/logical_and_cpu.h create mode 100644 src/infiniop/ops/logical_and/cuda/kernel.cuh create mode 100644 src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu create mode 100644 src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh create mode 100644 src/infiniop/ops/logical_and/operator.cc create mode 100644 src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc create mode 100644 src/infiniop/ops/logical_or/cpu/logical_or_cpu.h create mode 100644 src/infiniop/ops/logical_or/cuda/kernel.cuh create mode 100644 src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu create mode 100644 src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh create mode 100644 src/infiniop/ops/logical_or/operator.cc create mode 100644 src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc create mode 100644 src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h create mode 100644 src/infiniop/ops/logical_xor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu create mode 100644 src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh create mode 100644 src/infiniop/ops/logical_xor/operator.cc create mode 100644 src/infiniop/ops/lt/cpu/lt_cpu.cc create mode 100644 src/infiniop/ops/lt/cpu/lt_cpu.h create mode 100644 src/infiniop/ops/lt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/lt/nvidia/lt_nvidia.cu create mode 100644 src/infiniop/ops/lt/nvidia/lt_nvidia.cuh create mode 100644 src/infiniop/ops/lt/operator.cc create mode 100644 src/infiniop/ops/ne/cpu/ne_cpu.cc create mode 100644 src/infiniop/ops/ne/cpu/ne_cpu.h create mode 100644 src/infiniop/ops/ne/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ne/nvidia/ne_nvidia.cu create mode 100644 src/infiniop/ops/ne/nvidia/ne_nvidia.cuh create mode 100644 src/infiniop/ops/ne/operator.cc create mode 100644 src/infiniop/ops/remainder/cpu/remainder_cpu.cc create mode 100644 src/infiniop/ops/remainder/cpu/remainder_cpu.h create mode 100644 src/infiniop/ops/remainder/cuda/kernel.cuh create mode 100644 src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu create mode 100644 src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh create mode 100644 src/infiniop/ops/remainder/operator.cc create mode 100644 src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc create mode 100644 src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h create mode 100644 src/infiniop/ops/rsqrt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu create mode 100644 src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh create mode 100644 src/infiniop/ops/rsqrt/operator.cc create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.cc create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.h create mode 100644 src/infiniop/ops/sin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cu create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cuh create mode 100644 src/infiniop/ops/sin/operator.cc create mode 100644 src/infiniop/ops/sinc/cpu/sinc_cpu.cc create mode 100644 src/infiniop/ops/sinc/cpu/sinc_cpu.h create mode 100644 src/infiniop/ops/sinc/cuda/kernel.cuh create mode 100644 
src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu create mode 100644 src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh create mode 100644 src/infiniop/ops/sinc/operator.cc create mode 100644 src/infiniop/ops/square/cpu/square_cpu.cc create mode 100644 src/infiniop/ops/square/cpu/square_cpu.h create mode 100644 src/infiniop/ops/square/cuda/kernel.cuh create mode 100644 src/infiniop/ops/square/nvidia/square_nvidia.cu create mode 100644 src/infiniop/ops/square/nvidia/square_nvidia.cuh create mode 100644 src/infiniop/ops/square/operator.cc diff --git a/include/infiniop.h b/include/infiniop.h index e87839bc2..27f52ac85 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -9,15 +9,22 @@ #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/averagepool.h" +#include "infiniop/ops/batch_norm.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/cross_entropy_loss.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/gather.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/index_copy_inplace.h" +#include "infiniop/ops/interpolate_nearest.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/maxpool.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" @@ -28,6 +35,7 @@ #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/scatter.h" #include "infiniop/ops/sigmoid.h" #include "infiniop/ops/silu.h" #include "infiniop/ops/softmax.h" diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h index 87e857175..752484772 100644 --- a/include/infiniop/ops/averagepool.h +++ b/include/infiniop/ops/averagepool.h @@ -5,7 +5,7 @@ __C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t; -__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, infiniopAvgPoolDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc, @@ -14,16 +14,16 @@ __C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, void *pads, bool ceil_mode); -__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, void *workspace, size_t workspace_size, void *output, const void *input, void *stream); -__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); #endif // __INFINIOP_AVERAGEPOOL_H__ diff --git a/include/infiniop/ops/binary_ops_api.h b/include/infiniop/ops/binary_ops_api.h index 24d7715c9..8ddc6f0fd 100644 --- a/include/infiniop/ops/binary_ops_api.h +++ b/include/infiniop/ops/binary_ops_api.h @@ -15,9 +15,30 @@ // Declare all binary operator APIs BINARY_OP_API_DECLARE(div, Div) +BINARY_OP_API_DECLARE(floor_divide, FloorDivide) BINARY_OP_API_DECLARE(pow, Pow) +BINARY_OP_API_DECLARE(copysign, CopySign) +BINARY_OP_API_DECLARE(hypot, Hypot) 
+BINARY_OP_API_DECLARE(atan2, Atan2) BINARY_OP_API_DECLARE(mod, Mod) +BINARY_OP_API_DECLARE(remainder, Remainder) BINARY_OP_API_DECLARE(max, Max) BINARY_OP_API_DECLARE(min, Min) +BINARY_OP_API_DECLARE(fmax, Fmax) +BINARY_OP_API_DECLARE(fmin, Fmin) +BINARY_OP_API_DECLARE(gt, Gt) +BINARY_OP_API_DECLARE(lt, Lt) +BINARY_OP_API_DECLARE(ge, Ge) +BINARY_OP_API_DECLARE(le, Le) +BINARY_OP_API_DECLARE(eq, Eq) +BINARY_OP_API_DECLARE(ne, Ne) +BINARY_OP_API_DECLARE(logical_and, LogicalAnd) +BINARY_OP_API_DECLARE(logical_or, LogicalOr) +BINARY_OP_API_DECLARE(logical_xor, LogicalXor) +BINARY_OP_API_DECLARE(bitwise_and, BitwiseAnd) +BINARY_OP_API_DECLARE(bitwise_or, BitwiseOr) +BINARY_OP_API_DECLARE(bitwise_xor, BitwiseXor) +BINARY_OP_API_DECLARE(bitwise_left_shift, BitwiseLeftShift) +BINARY_OP_API_DECLARE(bitwise_right_shift, BitwiseRightShift) #endif // __INFINIOP_BINARY_OPS_API_H__ diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h index 8b59843c9..034a0e38f 100644 --- a/include/infiniop/ops/cross_entropy_loss.h +++ b/include/infiniop/ops/cross_entropy_loss.h @@ -5,16 +5,16 @@ typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t; -__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, infiniopCrossEntropyLossDescriptor_t *desc_ptr, infiniopTensorDescriptor_t loss_desc, infiniopTensorDescriptor_t logits_desc, infiniopTensorDescriptor_t target_desc); -__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, +__C __export infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, +__C __export infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, void *workspace, size_t workspace_size, void *loss, @@ -22,6 +22,6 @@ __C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t const void *target, void *stream); -__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); #endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h index 7f970dc38..c604a7d48 100644 --- a/include/infiniop/ops/interpolate_nearest.h +++ b/include/infiniop/ops/interpolate_nearest.h @@ -5,21 +5,21 @@ __C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t; -__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, infiniopInterpolateNearestDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc); -__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, +__C __export infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, +__C __export infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, void *workspace, size_t workspace_size, void *output, 
const void *input, void *stream); -__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); #endif // __INFINIOP_INTERPOLATE_NEAREST_H__ diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h index e47a43aed..7ee387fe8 100644 --- a/include/infiniop/ops/maxpool.h +++ b/include/infiniop/ops/maxpool.h @@ -5,7 +5,7 @@ __C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t; -__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, infiniopMaxPoolDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc, @@ -14,16 +14,16 @@ __C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, void *pads, bool ceil_mode); -__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, void *workspace, size_t workspace_size, void *output, const void *input, void *stream); -__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); #endif // __INFINIOP_MAX_POOL_H__ diff --git a/include/infiniop/ops/unary_ops_api.h b/include/infiniop/ops/unary_ops_api.h index 11cae2190..e97c21b8f 100644 --- a/include/infiniop/ops/unary_ops_api.h +++ b/include/infiniop/ops/unary_ops_api.h @@ -17,7 +17,12 @@ // Declare all unary operator APIs UNARY_OP_API_DECLARE(abs, Abs) UNARY_OP_API_DECLARE(log, Log) +UNARY_OP_API_DECLARE(log2, Log2) +UNARY_OP_API_DECLARE(log10, Log10) +UNARY_OP_API_DECLARE(log1p, Log1p) UNARY_OP_API_DECLARE(sqrt, Sqrt) +UNARY_OP_API_DECLARE(square, Square) +UNARY_OP_API_DECLARE(rsqrt, Rsqrt) UNARY_OP_API_DECLARE(reciprocal, Reciprocal) UNARY_OP_API_DECLARE(neg, Neg) UNARY_OP_API_DECLARE(round, Round) @@ -36,6 +41,12 @@ UNARY_OP_API_DECLARE(atan, Atan) UNARY_OP_API_DECLARE(acos, Acos) UNARY_OP_API_DECLARE(ceil, Ceil) UNARY_OP_API_DECLARE(exp, Exp) +UNARY_OP_API_DECLARE(exp2, Exp2) UNARY_OP_API_DECLARE(hardswish, Hardswish) +UNARY_OP_API_DECLARE(isnan, IsNan) +UNARY_OP_API_DECLARE(isinf, IsInf) +UNARY_OP_API_DECLARE(isfinite, IsFinite) +UNARY_OP_API_DECLARE(sinc, Sinc) +UNARY_OP_API_DECLARE(sin, Sin) #endif // __INFINIOP_UNARY_OPS_API_H__ diff --git a/src/infiniop/README.md b/src/infiniop/README.md index b4d4059e1..50e30e9c2 100644 --- a/src/infiniop/README.md +++ b/src/infiniop/README.md @@ -46,3 +46,211 @@ InfiniOP 是 InfiniCore 下属的统一底层算子框架,为相同算子在 一些 CUDA kernel 可以被多个支持 CUDA 的平台公用,可以考虑在头文件中实现,并在多个源文件中使用。 比如 `mul/cuda/kernel.cuh` 中只有 device 测代码,会被多个支持 CUDA 的平台源代码引用。 5. 
算子实现可以成功编译安装后,在 `test/infiniop/` 中添加单测脚本,与 PyTorch 实现进行正确性和性能比较。你可以仿照已有的测试脚本进行开发,以使用各种通用的测试功能。测例应覆盖算子常用类型和形状。测试成功之后可以将测例添加至 `scripts/python_test.py` 一键测试脚本中(这样 Github 自动测试也会包含该算子)。 + +## 添加 Elementwise 算子(Binary/Unary) + +对于逐元素算子(Elementwise Operators),由于重构后的统一框架,添加新算子变得非常简单。以下步骤展示了如何添加一个新的 elementwise 算子。 + +### Binary Elementwise 算子示例(以 `pow` 为例) + +#### 步骤 1: 在 `BinaryMode` 枚举中添加算子 + +在 `src/infiniop/elementwise/binary.h` 的 `BinaryMode` 枚举中添加新算子: + +```cpp +enum class BinaryMode { + // ... 其他算子 + Pow, // 添加新算子 + // ... +}; +``` + +#### 步骤 2: 在 `BinaryOp` 模板中添加计算逻辑 + +在同一文件的 `BinaryOp` 模板中添加对应的计算实现: + +```cpp +template +struct BinaryOp { + template + T operator()(const T &a, const T &b) const { + // ... 其他算子的实现 + else if constexpr (Mode == BinaryMode::Pow) { + return std::pow(a, b); + } + // ... + } +}; +``` + +如果需要在 CUDA 端优化,还需要在 `namespace cuda` 的 `BinaryOp` 模板中添加对应的 CUDA 实现。 + +#### 步骤 3: 在 API 头文件中声明算子 + +在 `include/infiniop/ops/binary_ops_api.h` 中添加: + +```cpp +BINARY_OP_API_DECLARE(pow, Pow) +``` + +#### 步骤 4: 创建算子目录和文件 + +创建目录结构 `src/infiniop/ops/pow/`,并创建以下文件: + +**`operator.cc`** - 主实现文件: +```cpp +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pow_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/pow_nvidia.cuh" +#endif + +BINARY_OP_IMPL(pow, Pow) +``` + +**`cpu/pow_cpu.h`** - CPU 头文件: +```cpp +#ifndef __POW_CPU_H__ +#define __POW_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, op::elementwise::binary::BinaryMode::Pow) + +#endif // __POW_CPU_H__ +``` + +**`cpu/pow_cpu.cc`** - CPU 实现文件: +```cpp +#include "pow_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::pow::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(pow) + +} // namespace op::pow::cpu +``` + +**`nvidia/pow_nvidia.cuh`** - NVIDIA 头文件: +```cpp +#ifndef __POW_CUDA_API_H__ +#define __POW_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(pow, nvidia) + +#endif // __POW_CUDA_API_H__ +``` + +**`nvidia/pow_nvidia.cu`** - NVIDIA 实现文件: +```cpp +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "pow_nvidia.cuh" + +namespace op::pow::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + +} // namespace op::pow::nvidia +``` + +**`cuda/kernel.cuh`**(可选)- 如果需要在 CUDA kernel 中实现特殊逻辑: +```cpp +// 通常不需要,除非有特殊的 CUDA 优化需求 +``` + +### Unary Elementwise 算子示例(以 `abs` 为例) + +Unary 算子的添加流程与 Binary 类似,主要区别如下: + +#### 步骤 1: 在 `UnaryMode` 枚举中添加算子 + +在 `src/infiniop/elementwise/unary.h` 的 `UnaryMode` 枚举中添加: + +```cpp +enum class UnaryMode { + // ... 其他算子 + Abs, // 添加新算子 + // ... +}; +``` + +#### 步骤 2: 在 `UnaryOp` 模板中添加计算逻辑 + +```cpp +template +struct UnaryOp { + template + T operator()(const T &x) const { + // ... 其他算子的实现 + else if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } + // ... 
+ } +}; +``` + +#### 步骤 3: 在 API 头文件中声明算子 + +在 `include/infiniop/ops/unary_ops_api.h` 中添加: + +```cpp +UNARY_OP_API_DECLARE(abs, Abs) +``` + +#### 步骤 4: 创建算子目录和文件 + +文件结构与 Binary 类似,但使用 `UNARY_` 前缀的宏: + +**`operator.cc`**: +```cpp +UNARY_OP_IMPL(abs, Abs) +``` + +**`cpu/abs_cpu.h`**: +```cpp +UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, op::elementwise::unary::UnaryMode::Abs) +``` + +**`cpu/abs_cpu.cc`**: +```cpp +ELEMENTWISE_CPU_IMPL_UNARY(abs) +``` + +**`nvidia/abs_nvidia.cu`**: +```cpp +ELEMENTWISE_NVIDIA_IMPL_UNARY(abs) +``` + +### 总结 + +添加一个新的 elementwise 算子只需要: + +1. ✅ 在对应的 `BinaryMode`/`UnaryMode` 枚举中添加算子 +2. ✅ 在 `BinaryOp`/`UnaryOp` 模板中添加计算逻辑 +3. ✅ 在 API 头文件中使用宏声明算子 +4. ✅ 创建算子目录,使用统一的宏实现各平台代码 + +**关键优势**: +- 代码复用:所有平台共享相同的实现框架 +- 最小改动:只需添加算子特定的计算逻辑 +- 统一接口:自动生成标准的 C API +- 易于维护:修改框架代码即可影响所有算子 + +参考实现: +- Binary: `src/infiniop/ops/pow/` +- Unary: `src/infiniop/ops/abs/` diff --git a/src/infiniop/elementwise/binary.h b/src/infiniop/elementwise/binary.h index 1823fac3f..ba982744d 100644 --- a/src/infiniop/elementwise/binary.h +++ b/src/infiniop/elementwise/binary.h @@ -27,14 +27,40 @@ enum class BinaryMode { Subtract, Multiply, Divide, + FloorDivide, // floor_divide: floor(a / b) Pow, + CopySign, + Hypot, + Atan2, // atan2: atan2(y, x) Mod, + Remainder, Max, Min, - // Logical operations (for future use): - // And, Or, Xor, Less, LessOrEqual, Equal, Greater, GreaterOrEqual + Fmax, + Fmin, + // Comparison operations: + Greater, // gt: a > b + Less, // lt: a < b + GreaterOrEqual, // ge: a >= b + LessOrEqual, // le: a <= b + Equal, // eq: a == b + NotEqual, // ne: a != b + // Logical operations: + LogicalAnd, // logical_and: a && b (non-zero as true) + LogicalOr, // logical_or: a || b (non-zero as true) + LogicalXor, // logical_xor: a ^ b (exactly one non-zero as true) + // Bitwise operations: + BitwiseAnd, // bitwise_and: a & b (only for integral types) + BitwiseOr, // bitwise_or: a | b (only for integral types) + BitwiseXor, // bitwise_xor: a ^ b (only for integral types) + BitwiseLeftShift, // bitwise_left_shift: a << b (only for integral types) + BitwiseRightShift, // bitwise_right_shift: a >> b (only for integral types) }; +// Helper template for static_assert in else branches +template +struct always_false : std::false_type {}; + /** * @brief Generic binary operation template that performs different operations * based on the specified BinaryMode. @@ -59,28 +85,145 @@ struct BinaryOp { return a * b; } else if constexpr (Mode == BinaryMode::Divide) { return a / b; + } else if constexpr (Mode == BinaryMode::FloorDivide) { + // Floor divide: floor(a / b) + if constexpr (std::is_integral_v) { + // For integral types, integer division is already floor division + return a / b; + } else { + // For floating point types, use std::floor + return std::floor(a / b); + } } else if constexpr (Mode == BinaryMode::Pow) { return std::pow(a, b); + } else if constexpr (Mode == BinaryMode::CopySign) { + if constexpr (std::is_floating_point_v) { + return std::copysign(a, b); + } else { + // For integral types, return a with sign of b + return (b < T(0)) ? 
-std::abs(a) : std::abs(a); + } + } else if constexpr (Mode == BinaryMode::Hypot) { + return std::hypot(a, b); + } else if constexpr (Mode == BinaryMode::Atan2) { + // atan2(y, x): returns the angle whose tangent is y/x + return std::atan2(a, b); } else if constexpr (Mode == BinaryMode::Mod) { if constexpr (std::is_floating_point_v) { return std::fmod(a, b); } else { return a % b; } + } else if constexpr (Mode == BinaryMode::Remainder) { + if constexpr (std::is_floating_point_v) { + // PyTorch remainder: x - floor(x/y) * y, result sign matches divisor (y) + T quotient = std::floor(a / b); + return a - quotient * b; + } else { + // For integral types, remainder is same as mod + return a % b; + } } else if constexpr (Mode == BinaryMode::Max) { + // Max: propagates NaN (if either is NaN, result is NaN) if constexpr (std::is_floating_point_v) { - return std::fmax(a, b); + // Use std::max which propagates NaN (a > b ? a : b behavior with NaN) + return (a > b) ? a : b; } else { return std::max(a, b); } } else if constexpr (Mode == BinaryMode::Min) { + // Min: propagates NaN (if either is NaN, result is NaN) + if constexpr (std::is_floating_point_v) { + // Use std::min which propagates NaN (a < b ? a : b behavior with NaN) + return (a < b) ? a : b; + } else { + return std::min(a, b); + } + } else if constexpr (Mode == BinaryMode::Fmax) { + // Fmax: ignores NaN (if one is NaN, return the other) + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + } else if constexpr (Mode == BinaryMode::Fmin) { + // Fmin: ignores NaN (if one is NaN, return the other) if constexpr (std::is_floating_point_v) { return std::fmin(a, b); } else { return std::min(a, b); } + } else if constexpr (Mode == BinaryMode::Greater) { + // Return 1.0 if a > b, else 0.0 + return static_cast(a > b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::Less) { + // Return 1.0 if a < b, else 0.0 + return static_cast(a < b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::GreaterOrEqual) { + // Return 1.0 if a >= b, else 0.0 + return static_cast(a >= b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LessOrEqual) { + // Return 1.0 if a <= b, else 0.0 + return static_cast(a <= b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::Equal) { + // Return 1.0 if a == b, else 0.0 + return static_cast(a == b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::NotEqual) { + // Return 1.0 if a != b, else 0.0 + return static_cast(a != b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LogicalAnd) { + // Return 1.0 if both a and b are non-zero, else 0.0 + return static_cast((a != T(0) && b != T(0)) ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LogicalOr) { + // Return 1.0 if either a or b is non-zero, else 0.0 + return static_cast((a != T(0) || b != T(0)) ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LogicalXor) { + // Return 1.0 if exactly one of a or b is non-zero, else 0.0 + bool a_nonzero = (a != T(0)); + bool b_nonzero = (b != T(0)); + return static_cast((a_nonzero != b_nonzero) ? 
T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::BitwiseAnd) { + // Bitwise AND: a & b (only for integral types) + if constexpr (std::is_integral_v) { + return a & b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseOr) { + // Bitwise OR: a | b (only for integral types) + if constexpr (std::is_integral_v) { + return a | b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseXor) { + // Bitwise XOR: a ^ b (only for integral types) + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseLeftShift) { + // Bitwise left shift: a << b (only for integral types) + if constexpr (std::is_integral_v) { + return a << b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseRightShift) { + // Bitwise right shift: a >> b (only for integral types) + if constexpr (std::is_integral_v) { + return a >> b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } } else { - static_assert(Mode != Mode, "Unsupported binary operation mode"); + static_assert(always_false::value, "Unsupported binary operation mode"); return a; } } @@ -143,6 +286,24 @@ struct BinaryOp { } else { return a / b; } + } else if constexpr (Mode == BinaryMode::FloorDivide) { + // Floor divide: floor(a / b) + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(floorf(a_f2.x / b_f2.x), floorf(a_f2.y / b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(floorf(a_ / b_)); + } else if constexpr (std::is_integral_v) { + // For integral types, integer division is already floor division + return a / b; + } else if constexpr (std::is_same_v) { + return floorf(a / b); + } else { + return std::floor(a / b); + } } else if constexpr (Mode == BinaryMode::Pow) { if constexpr (std::is_same_v) { float2 a_f2 = __half22float2(a); @@ -151,8 +312,8 @@ struct BinaryOp { } else if constexpr (std::is_same_v) { float a_ = __half2float(a); float b_ = __half2float(b); - float ans_f = __powf(a_, b_); - return __float2half(isnan(ans_f) ? 
std::pow(a_, b_) : ans_f); + // Use __powf only (std::pow is host function, cannot be used in device code) + return __float2half(__powf(a_, b_)); } else if constexpr (std::is_same_v) { float2 a_f2 = __bfloat1622float2(a); float2 b_f2 = __bfloat1622float2(b); @@ -166,42 +327,403 @@ struct BinaryOp { } else { return std::pow(a, b); } + } else if constexpr (Mode == BinaryMode::CopySign) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(copysignf(a_f2.x, b_f2.x), copysignf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(copysignf(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(copysignf(a_f2.x, b_f2.x), copysignf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(copysignf(a_, b_)); + } else if constexpr (std::is_same_v) { + return copysignf(a, b); + } else if constexpr (std::is_floating_point_v) { + return std::copysign(a, b); + } else { + // For integral types, return a with sign of b + return (b < T(0)) ? -std::abs(a) : std::abs(a); + } + } else if constexpr (Mode == BinaryMode::Hypot) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(hypotf(a_f2.x, b_f2.x), hypotf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(hypotf(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(hypotf(a_f2.x, b_f2.x), hypotf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(hypotf(a_, b_)); + } else if constexpr (std::is_same_v) { + return hypotf(a, b); + } else { + return std::hypot(a, b); + } + } else if constexpr (Mode == BinaryMode::Atan2) { + // atan2(y, x): returns the angle whose tangent is y/x + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(atan2f(a_f2.x, b_f2.x), atan2f(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(atan2f(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(atan2f(a_f2.x, b_f2.x), atan2f(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(atan2f(a_, b_)); + } else if constexpr (std::is_same_v) { + return atan2f(a, b); + } else { + return std::atan2(a, b); + } } else if constexpr (Mode == BinaryMode::Mod) { if constexpr (std::is_same_v) { float2 a_f2 = __half22float2(a); float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + return __float22half2_rn(make_float2(fmodf(a_f2.x, b_f2.x), fmodf(a_f2.y, b_f2.y))); } else if constexpr (std::is_same_v) { float a_ = __half2float(a); float b_ = __half2float(b); - return 
__float2half(std::fmod(a_, b_)); + return __float2half(fmodf(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(fmodf(a_f2.x, b_f2.x), fmodf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(fmodf(a_, b_)); } else if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); + return fmodf(a, b); + } else { + return a % b; + } + } else if constexpr (Mode == BinaryMode::Remainder) { + // PyTorch remainder: x - floor(x/y) * y, result sign matches divisor (y) + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + float2 q_f2 = make_float2(floorf(a_f2.x / b_f2.x), floorf(a_f2.y / b_f2.y)); + float2 r_f2 = make_float2(a_f2.x - q_f2.x * b_f2.x, a_f2.y - q_f2.y * b_f2.y); + return __float22half2_rn(r_f2); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float q_ = floorf(a_ / b_); + float r_ = a_ - q_ * b_; + return __float2half(r_); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + float2 q_f2 = make_float2(floorf(a_f2.x / b_f2.x), floorf(a_f2.y / b_f2.y)); + float2 r_f2 = make_float2(a_f2.x - q_f2.x * b_f2.x, a_f2.y - q_f2.y * b_f2.y); + return __floats2bfloat162_rn(r_f2.x, r_f2.y); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + float q_ = floorf(a_ / b_); + float r_ = a_ - q_ * b_; + return __float2bfloat16_rn(r_); + } else if constexpr (std::is_same_v) { + float q = floorf(a / b); + return a - q * b; + } else if constexpr (std::is_floating_point_v) { + T quotient = std::floor(a / b); + return a - quotient * b; } else { + // For integral types, remainder is same as mod return a % b; } } else if constexpr (Mode == BinaryMode::Max) { + // Max: propagates NaN (torch.maximum behavior) if constexpr (std::is_same_v) { return __hmax2(a, b); } else if constexpr (std::is_same_v || std::is_same_v) { + // For half/bfloat16, use comparison which propagates NaN return a > b ? a : b; } else if constexpr (std::is_same_v) { - return fmaxf(a, b); + // For float, use comparison which propagates NaN + return a > b ? a : b; } else { return a > b ? a : b; } } else if constexpr (Mode == BinaryMode::Min) { + // Min: propagates NaN (torch.minimum behavior) if constexpr (std::is_same_v) { return __hmin2(a, b); } else if constexpr (std::is_same_v || std::is_same_v) { + // For half/bfloat16, use comparison which propagates NaN + return a < b ? a : b; + } else if constexpr (std::is_same_v) { + // For float, use comparison which propagates NaN return a < b ? a : b; + } else { + return a < b ? 
a : b; + } + } else if constexpr (Mode == BinaryMode::Fmax) { + // Fmax: ignores NaN (torch.fmax behavior - if one is NaN, return the other) + if constexpr (std::is_same_v) { + // __hmax2 may propagate NaN, so implement custom NaN-ignoring version + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(fmaxf(a_f2.x, b_f2.x), fmaxf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(fmaxf(a_, b_)); + } else if constexpr (std::is_same_v) { + float a0 = __bfloat162float(__low2bfloat16(a)); + float a1 = __bfloat162float(__high2bfloat16(a)); + float b0 = __bfloat162float(__low2bfloat16(b)); + float b1 = __bfloat162float(__high2bfloat16(b)); + return __floats2bfloat162_rn(fmaxf(a0, b0), fmaxf(a1, b1)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(fmaxf(a_, b_)); + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else if constexpr (std::is_same_v) { + return fmax(a, b); + } else { + return a > b ? a : b; + } + } else if constexpr (Mode == BinaryMode::Fmin) { + // Fmin: ignores NaN (torch.fmin behavior - if one is NaN, return the other) + if constexpr (std::is_same_v) { + // __hmin2 may propagate NaN, so implement custom NaN-ignoring version + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(fminf(a_f2.x, b_f2.x), fminf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(fminf(a_, b_)); + } else if constexpr (std::is_same_v) { + float a0 = __bfloat162float(__low2bfloat16(a)); + float a1 = __bfloat162float(__high2bfloat16(a)); + float b0 = __bfloat162float(__low2bfloat16(b)); + float b1 = __bfloat162float(__high2bfloat16(b)); + return __floats2bfloat162_rn(fminf(a0, b0), fminf(a1, b1)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(fminf(a_, b_)); } else if constexpr (std::is_same_v) { return fminf(a, b); + } else if constexpr (std::is_same_v) { + return fmin(a, b); } else { return a < b ? a : b; } + } else if constexpr (Mode == BinaryMode::Greater) { + // Return 1.0 if a > b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x > b_f2.x) ? 1.0f : 0.0f, + (a_f2.y > b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a > b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a > b) ? 1.0f : 0.0f; + } else { + return static_cast((a > b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::Less) { + // Return 1.0 if a < b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x < b_f2.x) ? 1.0f : 0.0f, + (a_f2.y < b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a < b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a < b) ? 1.0f : 0.0f; + } else { + return static_cast((a < b) ? 
1 : 0); + } + } else if constexpr (Mode == BinaryMode::GreaterOrEqual) { + // Return 1.0 if a >= b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x >= b_f2.x) ? 1.0f : 0.0f, + (a_f2.y >= b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a >= b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a >= b) ? 1.0f : 0.0f; + } else { + return static_cast((a >= b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::LessOrEqual) { + // Return 1.0 if a <= b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x <= b_f2.x) ? 1.0f : 0.0f, + (a_f2.y <= b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a <= b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a <= b) ? 1.0f : 0.0f; + } else { + return static_cast((a <= b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::Equal) { + // Return 1.0 if a == b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x == b_f2.x) ? 1.0f : 0.0f, + (a_f2.y == b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a == b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a == b) ? 1.0f : 0.0f; + } else { + return static_cast((a == b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::NotEqual) { + // Return 1.0 if a != b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x != b_f2.x) ? 1.0f : 0.0f, + (a_f2.y != b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a != b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a != b) ? 1.0f : 0.0f; + } else { + return static_cast((a != b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::LogicalAnd) { + // Return 1.0 if both a and b are non-zero, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + ((a_f2.x != 0.0f) && (b_f2.x != 0.0f)) ? 1.0f : 0.0f, + ((a_f2.y != 0.0f) && (b_f2.y != 0.0f)) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return ((a != T(0)) && (b != T(0))) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return ((a != 0.0f) && (b != 0.0f)) ? 1.0f : 0.0f; + } else { + return static_cast(((a != T(0)) && (b != T(0))) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::LogicalOr) { + // Return 1.0 if either a or b is non-zero, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + ((a_f2.x != 0.0f) || (b_f2.x != 0.0f)) ? 1.0f : 0.0f, + ((a_f2.y != 0.0f) || (b_f2.y != 0.0f)) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return ((a != T(0)) || (b != T(0))) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return ((a != 0.0f) || (b != 0.0f)) ? 1.0f : 0.0f; + } else { + return static_cast(((a != T(0)) || (b != T(0))) ? 
1 : 0); + } + } else if constexpr (Mode == BinaryMode::LogicalXor) { + // Return 1.0 if exactly one of a or b is non-zero, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + bool a_x_nonzero = (a_f2.x != 0.0f); + bool b_x_nonzero = (b_f2.x != 0.0f); + bool a_y_nonzero = (a_f2.y != 0.0f); + bool b_y_nonzero = (b_f2.y != 0.0f); + return __float22half2_rn(make_float2( + (a_x_nonzero != b_x_nonzero) ? 1.0f : 0.0f, + (a_y_nonzero != b_y_nonzero) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + bool a_nonzero = (a != T(0)); + bool b_nonzero = (b != T(0)); + return (a_nonzero != b_nonzero) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + bool a_nonzero = (a != 0.0f); + bool b_nonzero = (b != 0.0f); + return (a_nonzero != b_nonzero) ? 1.0f : 0.0f; + } else { + bool a_nonzero = (a != T(0)); + bool b_nonzero = (b != T(0)); + return static_cast((a_nonzero != b_nonzero) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::BitwiseAnd) { + // Bitwise AND: a & b (only for integral types) + if constexpr (std::is_integral_v) { + return a & b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseOr) { + // Bitwise OR: a | b (only for integral types) + if constexpr (std::is_integral_v) { + return a | b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseXor) { + // Bitwise XOR: a ^ b (only for integral types) + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseLeftShift) { + // Bitwise left shift: a << b (only for integral types) + if constexpr (std::is_integral_v) { + return a << b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseRightShift) { + // Bitwise right shift: a >> b (only for integral types) + if constexpr (std::is_integral_v) { + return a >> b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } } else { - static_assert(Mode != Mode, "Unsupported binary operation mode"); + static_assert(always_false::value, "Unsupported binary operation mode"); return a; } } diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h index fff5b1819..fa0e216e0 100644 --- a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h +++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h @@ -32,38 +32,50 @@ /** * @brief Common Calculate Switch Cases (F16 & F32) */ -#define _IMPL_CALC_CASES_COMMON \ - case INFINI_DTYPE_F16: \ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ return _device_info->template calculate(_info, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ + case INFINI_DTYPE_F32: \ return _device_info->template calculate(_info, output, inputs, stream); /** * @brief Extended Calculate Switch Cases (Adds F64 & BF16) */ -#define _IMPL_CALC_CASES_EXTENDED \ - _IMPL_CALC_CASES_COMMON \ - case INFINI_DTYPE_F64: \ +#define _IMPL_CALC_CASES_EXTENDED \ + _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F64: \ return _device_info->template calculate(_info, output, inputs, stream); \ - case 
INFINI_DTYPE_BF16: \ + case INFINI_DTYPE_BF16: \ return _device_info->template calculate(_info, output, inputs, stream); +/** + * @brief Integral Calculate Switch Cases (I32, I64, U8) + * For bitwise operations that only support integral types + */ +#define _IMPL_CALC_CASES_INTEGRAL \ + case INFINI_DTYPE_I32: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_U8: \ + return _device_info->template calculate(_info, output, inputs, stream); + /** * @brief Generic Template for the Calculate method * @param CASES_MACRO The macro containing the switch cases to use */ -#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - switch (_dtype) { \ - CASES_MACRO \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ } /** @@ -71,20 +83,20 @@ * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking * @param ... Variadic arguments for allowed data types in CHECK_DTYPE */ -#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ - Descriptor::~Descriptor() = default; \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &out_shape = out_desc->shape(); \ - SHAPE_CHECK_BLOCK \ - CHECK_DTYPE(dtype, __VA_ARGS__); \ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) 
\ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ + return INFINI_STATUS_SUCCESS; \ } // ========================================================================= @@ -103,15 +115,14 @@ * ELEMENTWISE_CPU_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -126,13 +137,12 @@ * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -147,13 +157,34 @@ * ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp) * } */ -#define ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ - ) \ +#define ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) +/** + * @brief Implementation for Binary Operators with Integral Types (I32, I64, U8) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators that only support integral types (e.g., bitwise operations). 
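+ * Dtype dispatch is generated from _IMPL_CALC_CASES_INTEGRAL, so only INFINI_DTYPE_I32,
+ * INFINI_DTYPE_I64 and INFINI_DTYPE_U8 are accepted; any other dtype returns
+ * INFINI_STATUS_BAD_TENSOR_DTYPE from the generated calculate() method.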
+ * + * Usage: + * namespace op::bitwise_and::cpu { + * using Op = op::elementwise::binary::BinaryOp; + * ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_and) + * } + */ +#define ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U8) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_INTEGRAL) + #endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh index b0716db1a..4e325e68a 100644 --- a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -32,45 +32,57 @@ /** * @brief Common Calculate Switch Cases (F16 & F32) */ -#define _IMPL_CALC_CASES_COMMON \ - case INFINI_DTYPE_F16: \ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ + case INFINI_DTYPE_F32: \ return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); /** * @brief Extended Calculate Switch Cases (Adds F64 & BF16) * Note: Order is F16, BF16, F32, F64 to match original implementation */ -#define _IMPL_CALC_CASES_EXTENDED \ - case INFINI_DTYPE_F16: \ - return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_BF16: \ +#define _IMPL_CALC_CASES_EXTENDED \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ return _device_info->calculate<256, cuda::Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F64: \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ return _device_info->calculate<256, cuda::Op, double>(_info, workspace, output, inputs, stream); +/** + * @brief Integral Calculate Switch Cases (I32, I64, U8) + * For bitwise operations that only support integral types + */ +#define _IMPL_CALC_CASES_INTEGRAL \ + case INFINI_DTYPE_I32: \ + return _device_info->calculate<256, cuda::Op, int32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->calculate<256, cuda::Op, int64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U8: \ + return _device_info->calculate<256, cuda::Op, uint8_t>(_info, workspace, output, inputs, stream); + /** * @brief Generic Template for the Calculate method * @param CASES_MACRO The macro containing the switch cases to use */ -#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - if (workspace_size < _workspace_size) { \ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if 
(workspace_size < _workspace_size) { \ return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ - } \ - switch (_dtype) { \ - CASES_MACRO \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ + } \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ } /** @@ -78,20 +90,20 @@ * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking * @param ... Variadic arguments for allowed data types in CHECK_DTYPE */ -#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ - Descriptor::~Descriptor() = default; \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &out_shape = out_desc->shape(); \ - SHAPE_CHECK_BLOCK \ - CHECK_DTYPE(dtype, __VA_ARGS__); \ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ + return INFINI_STATUS_SUCCESS; \ } // ========================================================================= @@ -109,15 +121,14 @@ * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -131,13 +142,12 @@ * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -151,13 +161,33 @@ * ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ - ) \ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); 
\ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) +/** + * @brief Implementation for Binary Operators with Integral Types (I32, I64, U8) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators that only support integral types (e.g., bitwise operations). + * + * Usage: + * namespace op::bitwise_and::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_and) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U8) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_INTEGRAL) + #endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h index 330f305dd..ef3fdc8fd 100644 --- a/src/infiniop/elementwise/unary.h +++ b/src/infiniop/elementwise/unary.h @@ -25,9 +25,15 @@ enum class UnaryMode { // Math operations: Abs, Exp, + Exp2, // exp2: 2^x Log, + Log2, // log2: log base 2 + Log10, // log10: log base 10 + Log1p, // log1p: log(1 + x), numerically stable for values close to zero Reciprocal, Sqrt, + Square, + Rsqrt, Neg, Ceil, Floor, @@ -49,8 +55,16 @@ enum class UnaryMode { Sign, Erf, Hardswish, + IsNan, + IsInf, + IsFinite, + Sinc, }; +// Helper template for static_assert in else branches +template +struct always_false : std::false_type {}; + /** * @brief Generic unary operation template that performs different operations * based on the specified UnaryMode. @@ -75,12 +89,28 @@ struct UnaryOp { } } else if constexpr (Mode == UnaryMode::Exp) { return std::exp(x); + } else if constexpr (Mode == UnaryMode::Exp2) { + // exp2: 2^x + return std::exp2(x); } else if constexpr (Mode == UnaryMode::Log) { return std::log(x); + } else if constexpr (Mode == UnaryMode::Log2) { + // log2: log base 2 + return std::log2(x); + } else if constexpr (Mode == UnaryMode::Log10) { + // log10: log base 10 + return std::log10(x); + } else if constexpr (Mode == UnaryMode::Log1p) { + // log1p: log(1 + x), numerically stable for values close to zero + return std::log1p(x); } else if constexpr (Mode == UnaryMode::Reciprocal) { return T(1) / x; } else if constexpr (Mode == UnaryMode::Sqrt) { return std::sqrt(x); + } else if constexpr (Mode == UnaryMode::Square) { + return x * x; + } else if constexpr (Mode == UnaryMode::Rsqrt) { + return T(1) / std::sqrt(x); } else if constexpr (Mode == UnaryMode::Neg) { return -x; } else if constexpr (Mode == UnaryMode::Ceil) { @@ -125,19 +155,55 @@ struct UnaryOp { return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); } else if constexpr (Mode == UnaryMode::Erf) { return std::erf(x); + } else if constexpr (Mode == UnaryMode::IsNan) { + if constexpr (std::is_floating_point_v) { + return std::isnan(x) ? T(1) : T(0); + } else { + // For integral types, NaN doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsInf) { + if constexpr (std::is_floating_point_v) { + return std::isinf(x) ? 
T(1) : T(0); + } else { + // For integral types, Inf doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsFinite) { + if constexpr (std::is_floating_point_v) { + return std::isfinite(x) ? T(1) : T(0); + } else { + // For integral types, all values are finite, so always return 1 + return T(1); + } + } else if constexpr (Mode == UnaryMode::Sinc) { + // sinc(x) = sin(x) / x, sinc(0) = 1 + // For small values, use Taylor expansion for numerical stability + // sinc(x) ≈ 1 - x²/6 + x⁴/120 - x⁶/5040 + if constexpr (std::is_floating_point_v) { + T abs_x = std::abs(x); + if (abs_x < T(1e-2)) { + T x2 = x * x; + return T(1) - x2 * (T(1) / T(6) - x2 * (T(1) / T(120) - x2 * (T(1) / T(5040)))); + } else { + return std::sin(x) / x; + } + } else { + // For integral types, sinc is not well-defined, return 1 for 0, 0 otherwise + return x == T(0) ? T(1) : T(0); + } } else if constexpr (Mode == UnaryMode::Hardswish) { if constexpr (std::is_integral_v) { return static_cast(0); } else { // x * clamp(x + 3, 0, 6) / 6 - auto x_val = static_cast(x); - double y = x_val + 3.0; - y = std::min(std::max(y, 0.0), 6.0); - double out = x_val * (y / 6.0); - return static_cast(out); + // Use template type T directly instead of double for better performance + T y = x + T(3); + y = std::min(std::max(y, T(0)), T(6)); + return x * (y / T(6)); } } else { - static_assert(Mode != Mode, "Unsupported unary operation mode"); + static_assert(always_false::value, "Unsupported unary operation mode"); return x; } } @@ -186,6 +252,23 @@ struct UnaryOp { } else { return std::exp(x); } + } else if constexpr (Mode == UnaryMode::Exp2) { + // exp2: 2^x + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(exp2f(x_f2.x), exp2f(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(exp2f(__half2float(x))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + return __floats2bfloat162_rn(exp2f(x_f2.x), exp2f(x_f2.y)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(exp2f(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return exp2f(x); + } else { + return std::exp2(x); + } } else if constexpr (Mode == UnaryMode::Log) { if constexpr (std::is_same_v) { return h2log(x); @@ -202,6 +285,62 @@ struct UnaryOp { } else { return std::log(x); } + } else if constexpr (Mode == UnaryMode::Log2) { + // log2: log base 2 + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(log2f(x_f2.x), log2f(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(log2f(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(log2f(x0), log2f(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(log2f(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return log2f(x); + } else { + return std::log2(x); + } + } else if constexpr (Mode == UnaryMode::Log10) { + // log10: log base 10 + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(log10f(x_f2.x), log10f(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(log10f(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return 
__floats2bfloat162_rn(log10f(x0), log10f(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(log10f(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return log10f(x); + } else { + return std::log10(x); + } + } else if constexpr (Mode == UnaryMode::Log1p) { + // log1p: log(1 + x), numerically stable for values close to zero + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(log1pf(x_f2.x), log1pf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(log1pf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(log1pf(x0), log1pf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(log1pf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return log1pf(x); + } else if constexpr (std::is_same_v) { + return log1p(x); + } else { + return std::log1p(x); + } } else if constexpr (Mode == UnaryMode::Reciprocal) { if constexpr (std::is_same_v) { return h2rcp(x); @@ -234,6 +373,25 @@ struct UnaryOp { } else { return std::sqrt(x); } + } else if constexpr (Mode == UnaryMode::Square) { + return x * x; + } else if constexpr (Mode == UnaryMode::Rsqrt) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__frsqrt_rn(x_f2.x), __frsqrt_rn(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__frsqrt_rn(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frsqrt_rn(x0), __frsqrt_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frsqrt_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frsqrt_rn(x); + } else { + return T(1) / std::sqrt(x); + } } else if constexpr (Mode == UnaryMode::Neg) { if constexpr (std::is_same_v) { return __hneg2(x); @@ -409,6 +567,8 @@ struct UnaryOp { return __float2bfloat16_rn(tanhf(__bfloat162float(x))); } else if constexpr (std::is_same_v) { return tanhf(x); + } else if constexpr (std::is_same_v) { + return ::tanh(x); } else { return std::tanh(x); } @@ -474,8 +634,17 @@ struct UnaryOp { } else if constexpr (std::is_same_v) { float x_ = __half2float(x); return __float2half(1.0f / (1.0f + __expf(-x_))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + float2 exp_neg_x = make_float2(__expf(-x_f2.x), __expf(-x_f2.y)); + return __floats2bfloat162_rn(1.0f / (1.0f + exp_neg_x.x), 1.0f / (1.0f + exp_neg_x.y)); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn(1.0f / (1.0f + __expf(-x_))); } else if constexpr (std::is_same_v) { return 1.0f / (1.0f + __expf(-x)); + } else if constexpr (std::is_same_v) { + return 1.0 / (1.0 + exp(-x)); } else { return T(1) / (T(1) + std::exp(-x)); } @@ -499,6 +668,177 @@ struct UnaryOp { } else { return std::erf(x); } + } else if constexpr (Mode == UnaryMode::IsNan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2( + __isnanf(x_f2.x) ? 1.0f : 0.0f, + __isnanf(x_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(__isnanf(x_) ? 
1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn( + __isnanf(x0) ? 1.0f : 0.0f, + __isnanf(x1) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn(__isnanf(x_) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + return __isnanf(x) ? 1.0f : 0.0f; + } else if constexpr (std::is_same_v) { + return __isnan(x) ? 1.0 : 0.0; + } else if constexpr (std::is_floating_point_v) { + return std::isnan(x) ? T(1) : T(0); + } else { + // For integral types, NaN doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsInf) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2( + __isinff(x_f2.x) ? 1.0f : 0.0f, + __isinff(x_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(__isinff(x_) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn( + __isinff(x0) ? 1.0f : 0.0f, + __isinff(x1) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn(__isinff(x_) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + return __isinff(x) ? 1.0f : 0.0f; + } else if constexpr (std::is_same_v) { + return __isinf(x) ? 1.0 : 0.0; + } else if constexpr (std::is_floating_point_v) { + return std::isinf(x) ? T(1) : T(0); + } else { + // For integral types, Inf doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsFinite) { + // isfinite(x) = !isnan(x) && !isinf(x) + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + auto isfinite_f32 = [](float val) -> float { + return (!__isnanf(val) && !__isinff(val)) ? 1.0f : 0.0f; + }; + return __float22half2_rn(make_float2( + isfinite_f32(x_f2.x), + isfinite_f32(x_f2.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half((!__isnanf(x_) && !__isinff(x_)) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + auto isfinite_f32 = [](float val) -> float { + return (!__isnanf(val) && !__isinff(val)) ? 1.0f : 0.0f; + }; + return __floats2bfloat162_rn( + isfinite_f32(x0), + isfinite_f32(x1)); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn((!__isnanf(x_) && !__isinff(x_)) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + return (!__isnanf(x) && !__isinff(x)) ? 1.0f : 0.0f; + } else if constexpr (std::is_same_v) { + return (!__isnan(x) && !__isinf(x)) ? 1.0 : 0.0; + } else if constexpr (std::is_floating_point_v) { + return std::isfinite(x) ? 
T(1) : T(0); + } else { + // For integral types, all values are finite, so always return 1 + return T(1); + } + } else if constexpr (Mode == UnaryMode::Sinc) { + // sinc(x) = sin(x) / x, sinc(0) = 1 + // For small values, use Taylor expansion for numerical stability + // sinc(x) ≈ 1 - x²/6 + x⁴/120 - x⁶/5040 + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + auto sinc_f32 = [](float val) -> float { + float abs_val = fabsf(val); + if (abs_val < 1e-2f) { + // Use Taylor expansion for small values: 1 - x²/6 + x⁴/120 - x⁶/5040 + float x2 = val * val; + return 1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f))); + } else { + return __sinf(val) / val; + } + }; + return __float22half2_rn(make_float2( + sinc_f32(x_f2.x), + sinc_f32(x_f2.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + float abs_x = fabsf(x_); + if (abs_x < 1e-2f) { + float x2 = x_ * x_; + return __float2half(1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f)))); + } else { + return __float2half(__sinf(x_) / x_); + } + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + auto sinc_f32 = [](float val) -> float { + float abs_val = fabsf(val); + if (abs_val < 1e-2f) { + float x2 = val * val; + return 1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f))); + } else { + return sinf(val) / val; + } + }; + return __floats2bfloat162_rn( + sinc_f32(x0), + sinc_f32(x1)); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + float abs_x = fabsf(x_); + if (abs_x < 1e-2f) { + float x2 = x_ * x_; + return __float2bfloat16_rn(1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f)))); + } else { + return __float2bfloat16_rn(sinf(x_) / x_); + } + } else if constexpr (std::is_same_v) { + float abs_x = fabsf(x); + if (abs_x < 1e-2f) { + float x2 = x * x; + return 1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f))); + } else { + return __sinf(x) / x; + } + } else if constexpr (std::is_same_v) { + double abs_x = std::fabs(x); + if (abs_x < 1e-6) { + double x2 = x * x; + return 1.0 - x2 * (1.0 / 6.0 - x2 * (1.0 / 120.0 - x2 * (1.0 / 5040.0))); + } else { + return std::sin(x) / x; + } + } else if constexpr (std::is_floating_point_v) { + T abs_x = std::abs(x); + if (abs_x < T(1e-2)) { + T x2 = x * x; + return T(1) - x2 * (T(1) / T(6) - x2 * (T(1) / T(120) - x2 * (T(1) / T(5040)))); + } else { + return std::sin(x) / x; + } + } else { + // For integral types, sinc is not well-defined, return 1 for 0, 0 otherwise + return x == T(0) ? 
T(1) : T(0); + } } else if constexpr (Mode == UnaryMode::Hardswish) { // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 auto hswish_f32 = [](float x) -> float { @@ -535,7 +875,7 @@ struct UnaryOp { return static_cast(yd); } } else { - static_assert(Mode != Mode, "Unsupported unary operation mode"); + static_assert(always_false::value, "Unsupported unary operation mode"); return x; } } diff --git a/src/infiniop/ops/atan2/cpu/atan2_cpu.cc b/src/infiniop/ops/atan2/cpu/atan2_cpu.cc new file mode 100644 index 000000000..8d300d867 --- /dev/null +++ b/src/infiniop/ops/atan2/cpu/atan2_cpu.cc @@ -0,0 +1,8 @@ +#include "atan2_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::atan2::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(atan2) + +} // namespace op::atan2::cpu diff --git a/src/infiniop/ops/atan2/cpu/atan2_cpu.h b/src/infiniop/ops/atan2/cpu/atan2_cpu.h new file mode 100644 index 000000000..9a26d425d --- /dev/null +++ b/src/infiniop/ops/atan2/cpu/atan2_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ATAN2_CPU_H__ +#define __ATAN2_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(atan2, cpu, op::elementwise::binary::BinaryMode::Atan2) + +#endif // __ATAN2_CPU_H__ diff --git a/src/infiniop/ops/atan2/cuda/kernel.cuh b/src/infiniop/ops/atan2/cuda/kernel.cuh new file mode 100644 index 000000000..22ca06c8d --- /dev/null +++ b/src/infiniop/ops/atan2/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ATAN2_CUDA_H__ +#define __ATAN2_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::atan2::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::atan2::cuda + +#endif // __ATAN2_CUDA_H__ diff --git a/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu new file mode 100644 index 000000000..716ff6884 --- /dev/null +++ b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "atan2_nvidia.cuh" + +namespace op::atan2::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(atan2) + +} // namespace op::atan2::nvidia diff --git a/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh new file mode 100644 index 000000000..c882c22a8 --- /dev/null +++ b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATAN2_CUDA_API_H__ +#define __ATAN2_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atan2, nvidia) + +#endif // __ATAN2_CUDA_API_H__ diff --git a/src/infiniop/ops/atan2/operator.cc b/src/infiniop/ops/atan2/operator.cc new file mode 100644 index 000000000..3580402ac --- /dev/null +++ b/src/infiniop/ops/atan2/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atan2_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atan2_nvidia.cuh" +#endif + +BINARY_OP_IMPL(atan2, Atan2) diff --git a/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc new file mode 100644 index 000000000..cb293baba --- /dev/null +++ b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_and_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_and::cpu { + 
+ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_and) + +} // namespace op::bitwise_and::cpu diff --git a/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h new file mode 100644 index 000000000..348022cb2 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_AND_CPU_H__ +#define __BITWISE_AND_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_and, cpu, op::elementwise::binary::BinaryMode::BitwiseAnd) + +#endif // __BITWISE_AND_CPU_H__ diff --git a/src/infiniop/ops/bitwise_and/cuda/kernel.cuh b/src/infiniop/ops/bitwise_and/cuda/kernel.cuh new file mode 100644 index 000000000..a1a58b3c9 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_AND_CUDA_H__ +#define __BITWISE_AND_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_and::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_and::cuda + +#endif // __BITWISE_AND_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu new file mode 100644 index 000000000..e97f957cb --- /dev/null +++ b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_and_nvidia.cuh" + +namespace op::bitwise_and::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_and) + +} // namespace op::bitwise_and::nvidia diff --git a/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh new file mode 100644 index 000000000..0f9c65f22 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_AND_CUDA_API_H__ +#define __BITWISE_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_and, nvidia) + +#endif // __BITWISE_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_and/operator.cc b/src/infiniop/ops/bitwise_and/operator.cc new file mode 100644 index 000000000..c934bee86 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_and_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_and, BitwiseAnd) diff --git a/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc new file mode 100644 index 000000000..d5f367515 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_left_shift_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_left_shift::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_left_shift) + +} // namespace op::bitwise_left_shift::cpu diff --git a/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h new file mode 100644 index 000000000..36636bb56 --- /dev/null +++ 
b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_LEFT_SHIFT_CPU_H__ +#define __BITWISE_LEFT_SHIFT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_left_shift, cpu, op::elementwise::binary::BinaryMode::BitwiseLeftShift) + +#endif // __BITWISE_LEFT_SHIFT_CPU_H__ diff --git a/src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh b/src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh new file mode 100644 index 000000000..0f74548b6 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_LEFT_SHIFT_CUDA_H__ +#define __BITWISE_LEFT_SHIFT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_left_shift::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_left_shift::cuda + +#endif // __BITWISE_LEFT_SHIFT_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu new file mode 100644 index 000000000..66fbd856c --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_left_shift_nvidia.cuh" + +namespace op::bitwise_left_shift::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_left_shift) + +} // namespace op::bitwise_left_shift::nvidia diff --git a/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh new file mode 100644 index 000000000..89a573145 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_LEFT_SHIFT_CUDA_API_H__ +#define __BITWISE_LEFT_SHIFT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_left_shift, nvidia) + +#endif // __BITWISE_LEFT_SHIFT_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_left_shift/operator.cc b/src/infiniop/ops/bitwise_left_shift/operator.cc new file mode 100644 index 000000000..297874f10 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_left_shift_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_left_shift_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_left_shift, BitwiseLeftShift) diff --git a/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc new file mode 100644 index 000000000..9808c8294 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_or_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_or::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_or) + +} // namespace op::bitwise_or::cpu diff --git a/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h new file mode 100644 index 000000000..f0503b1fd --- /dev/null +++ b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_OR_CPU_H__ +#define __BITWISE_OR_CPU_H__ + +#include 
"../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_or, cpu, op::elementwise::binary::BinaryMode::BitwiseOr) + +#endif // __BITWISE_OR_CPU_H__ diff --git a/src/infiniop/ops/bitwise_or/cuda/kernel.cuh b/src/infiniop/ops/bitwise_or/cuda/kernel.cuh new file mode 100644 index 000000000..0424f03b8 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_OR_CUDA_H__ +#define __BITWISE_OR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_or::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_or::cuda + +#endif // __BITWISE_OR_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu new file mode 100644 index 000000000..8bf58dbfa --- /dev/null +++ b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_or_nvidia.cuh" + +namespace op::bitwise_or::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_or) + +} // namespace op::bitwise_or::nvidia diff --git a/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh new file mode 100644 index 000000000..419ac2603 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_OR_CUDA_API_H__ +#define __BITWISE_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_or, nvidia) + +#endif // __BITWISE_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_or/operator.cc b/src/infiniop/ops/bitwise_or/operator.cc new file mode 100644 index 000000000..40a68f2a8 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_or_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_or, BitwiseOr) diff --git a/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc new file mode 100644 index 000000000..c11022f26 --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_right_shift_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_right_shift::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_right_shift) + +} // namespace op::bitwise_right_shift::cpu diff --git a/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h new file mode 100644 index 000000000..f4d023c4d --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_RIGHT_SHIFT_CPU_H__ +#define __BITWISE_RIGHT_SHIFT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_right_shift, cpu, op::elementwise::binary::BinaryMode::BitwiseRightShift) + +#endif // __BITWISE_RIGHT_SHIFT_CPU_H__ diff --git a/src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh 
b/src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh new file mode 100644 index 000000000..43308fb3b --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_RIGHT_SHIFT_CUDA_H__ +#define __BITWISE_RIGHT_SHIFT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_right_shift::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_right_shift::cuda + +#endif // __BITWISE_RIGHT_SHIFT_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu new file mode 100644 index 000000000..2be31f1ce --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_right_shift_nvidia.cuh" + +namespace op::bitwise_right_shift::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_right_shift) + +} // namespace op::bitwise_right_shift::nvidia diff --git a/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh new file mode 100644 index 000000000..749189263 --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_RIGHT_SHIFT_CUDA_API_H__ +#define __BITWISE_RIGHT_SHIFT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_right_shift, nvidia) + +#endif // __BITWISE_RIGHT_SHIFT_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_right_shift/operator.cc b/src/infiniop/ops/bitwise_right_shift/operator.cc new file mode 100644 index 000000000..6c5c1957e --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_right_shift_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_right_shift_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_right_shift, BitwiseRightShift) diff --git a/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc new file mode 100644 index 000000000..6f8cdc7e3 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_xor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_xor::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_xor) + +} // namespace op::bitwise_xor::cpu diff --git a/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h new file mode 100644 index 000000000..e971b7a16 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_XOR_CPU_H__ +#define __BITWISE_XOR_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_xor, cpu, op::elementwise::binary::BinaryMode::BitwiseXor) + +#endif // __BITWISE_XOR_CPU_H__ diff --git a/src/infiniop/ops/bitwise_xor/cuda/kernel.cuh b/src/infiniop/ops/bitwise_xor/cuda/kernel.cuh new file mode 100644 index 000000000..ef7f23bca --- /dev/null +++ 
b/src/infiniop/ops/bitwise_xor/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_XOR_CUDA_H__ +#define __BITWISE_XOR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_xor::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_xor::cuda + +#endif // __BITWISE_XOR_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu new file mode 100644 index 000000000..64aff8297 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_xor_nvidia.cuh" + +namespace op::bitwise_xor::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_xor) + +} // namespace op::bitwise_xor::nvidia diff --git a/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh new file mode 100644 index 000000000..e3ae06a54 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_XOR_CUDA_API_H__ +#define __BITWISE_XOR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_xor, nvidia) + +#endif // __BITWISE_XOR_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_xor/operator.cc b/src/infiniop/ops/bitwise_xor/operator.cc new file mode 100644 index 000000000..04529fe68 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_xor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_xor_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_xor, BitwiseXor) diff --git a/src/infiniop/ops/copysign/cpu/copysign_cpu.cc b/src/infiniop/ops/copysign/cpu/copysign_cpu.cc new file mode 100644 index 000000000..fe3757fa5 --- /dev/null +++ b/src/infiniop/ops/copysign/cpu/copysign_cpu.cc @@ -0,0 +1,8 @@ +#include "copysign_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::copysign::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(copysign) + +} // namespace op::copysign::cpu diff --git a/src/infiniop/ops/copysign/cpu/copysign_cpu.h b/src/infiniop/ops/copysign/cpu/copysign_cpu.h new file mode 100644 index 000000000..99123765e --- /dev/null +++ b/src/infiniop/ops/copysign/cpu/copysign_cpu.h @@ -0,0 +1,9 @@ +#ifndef __COPYSIGN_CPU_H__ +#define __COPYSIGN_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(copysign, cpu, op::elementwise::binary::BinaryMode::CopySign) + +#endif // __COPYSIGN_CPU_H__ diff --git a/src/infiniop/ops/copysign/cuda/kernel.cuh b/src/infiniop/ops/copysign/cuda/kernel.cuh new file mode 100644 index 000000000..2064923c9 --- /dev/null +++ b/src/infiniop/ops/copysign/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __COPYSIGN_CUDA_H__ +#define __COPYSIGN_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::copysign::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::copysign::cuda + +#endif // __COPYSIGN_CUDA_H__ diff --git a/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu new file mode 100644 index 000000000..261a4a204 --- /dev/null +++ 
b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "copysign_nvidia.cuh" + +namespace op::copysign::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(copysign) + +} // namespace op::copysign::nvidia diff --git a/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh new file mode 100644 index 000000000..33005c379 --- /dev/null +++ b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COPYSIGN_NVIDIA_API_H__ +#define __COPYSIGN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(copysign, nvidia) + +#endif // __COPYSIGN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/copysign/operator.cc b/src/infiniop/ops/copysign/operator.cc new file mode 100644 index 000000000..0c8652ff7 --- /dev/null +++ b/src/infiniop/ops/copysign/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/copysign_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/copysign_nvidia.cuh" +#endif + +BINARY_OP_IMPL(copysign, CopySign) diff --git a/src/infiniop/ops/eq/cpu/eq_cpu.cc b/src/infiniop/ops/eq/cpu/eq_cpu.cc new file mode 100644 index 000000000..e9b5dcefa --- /dev/null +++ b/src/infiniop/ops/eq/cpu/eq_cpu.cc @@ -0,0 +1,8 @@ +#include "eq_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::eq::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(eq) + +} // namespace op::eq::cpu diff --git a/src/infiniop/ops/eq/cpu/eq_cpu.h b/src/infiniop/ops/eq/cpu/eq_cpu.h new file mode 100644 index 000000000..c1de9b01c --- /dev/null +++ b/src/infiniop/ops/eq/cpu/eq_cpu.h @@ -0,0 +1,9 @@ +#ifndef __EQ_CPU_H__ +#define __EQ_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(eq, cpu, op::elementwise::binary::BinaryMode::Equal) + +#endif // __EQ_CPU_H__ diff --git a/src/infiniop/ops/eq/cuda/kernel.cuh b/src/infiniop/ops/eq/cuda/kernel.cuh new file mode 100644 index 000000000..a8557604c --- /dev/null +++ b/src/infiniop/ops/eq/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __EQ_CUDA_H__ +#define __EQ_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::eq::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::eq::cuda + +#endif // __EQ_CUDA_H__ diff --git a/src/infiniop/ops/eq/nvidia/eq_nvidia.cu b/src/infiniop/ops/eq/nvidia/eq_nvidia.cu new file mode 100644 index 000000000..b3dd9722d --- /dev/null +++ b/src/infiniop/ops/eq/nvidia/eq_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "eq_nvidia.cuh" + +namespace op::eq::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(eq) + +} // namespace op::eq::nvidia diff --git a/src/infiniop/ops/eq/nvidia/eq_nvidia.cuh b/src/infiniop/ops/eq/nvidia/eq_nvidia.cuh new file mode 100644 index 000000000..bbe6ccbdb --- /dev/null +++ b/src/infiniop/ops/eq/nvidia/eq_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EQ_CUDA_API_H__ +#define __EQ_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(eq, nvidia) + +#endif // __EQ_CUDA_API_H__ diff --git a/src/infiniop/ops/eq/operator.cc b/src/infiniop/ops/eq/operator.cc new file mode 
100644 index 000000000..380fcfab7 --- /dev/null +++ b/src/infiniop/ops/eq/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/eq_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/eq_nvidia.cuh" +#endif + +BINARY_OP_IMPL(eq, Eq) diff --git a/src/infiniop/ops/exp2/cpu/exp2_cpu.cc b/src/infiniop/ops/exp2/cpu/exp2_cpu.cc new file mode 100644 index 000000000..4a1bfdc88 --- /dev/null +++ b/src/infiniop/ops/exp2/cpu/exp2_cpu.cc @@ -0,0 +1,8 @@ +#include "exp2_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::exp2::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp2) + +} // namespace op::exp2::cpu diff --git a/src/infiniop/ops/exp2/cpu/exp2_cpu.h b/src/infiniop/ops/exp2/cpu/exp2_cpu.h new file mode 100644 index 000000000..1f86686a4 --- /dev/null +++ b/src/infiniop/ops/exp2/cpu/exp2_cpu.h @@ -0,0 +1,9 @@ +#ifndef __EXP2_CPU_H__ +#define __EXP2_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(exp2, cpu, op::elementwise::unary::UnaryMode::Exp2) + +#endif // __EXP2_CPU_H__ diff --git a/src/infiniop/ops/exp2/cuda/kernel.cuh b/src/infiniop/ops/exp2/cuda/kernel.cuh new file mode 100644 index 000000000..50987350c --- /dev/null +++ b/src/infiniop/ops/exp2/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __EXP2_CUDA_H__ +#define __EXP2_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::exp2::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::exp2::cuda + +#endif // __EXP2_CUDA_H__ diff --git a/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu new file mode 100644 index 000000000..6097a7745 --- /dev/null +++ b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "exp2_nvidia.cuh" + +namespace op::exp2::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp2) + +} // namespace op::exp2::nvidia diff --git a/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh new file mode 100644 index 000000000..d00e368fb --- /dev/null +++ b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP2_CUDA_API_H__ +#define __EXP2_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp2, nvidia) + +#endif // __EXP2_CUDA_API_H__ diff --git a/src/infiniop/ops/exp2/operator.cc b/src/infiniop/ops/exp2/operator.cc new file mode 100644 index 000000000..c4f695f18 --- /dev/null +++ b/src/infiniop/ops/exp2/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp2_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/exp2_nvidia.cuh" +#endif + +UNARY_OP_IMPL(exp2, Exp2) diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc new file mode 100644 index 000000000..5e6665a70 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc @@ -0,0 +1,8 @@ +#include "floor_divide_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::floor_divide::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(floor_divide) 
+ +} // namespace op::floor_divide::cpu diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h new file mode 100644 index 000000000..e3286b837 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h @@ -0,0 +1,9 @@ +#ifndef __FLOOR_DIVIDE_CPU_H__ +#define __FLOOR_DIVIDE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(floor_divide, cpu, op::elementwise::binary::BinaryMode::FloorDivide) + +#endif // __FLOOR_DIVIDE_CPU_H__ diff --git a/src/infiniop/ops/floor_divide/cuda/kernel.cuh b/src/infiniop/ops/floor_divide/cuda/kernel.cuh new file mode 100644 index 000000000..a4303f883 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __FLOOR_DIVIDE_CUDA_H__ +#define __FLOOR_DIVIDE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::floor_divide::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::floor_divide::cuda + +#endif // __FLOOR_DIVIDE_CUDA_H__ diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu new file mode 100644 index 000000000..764891fa4 --- /dev/null +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "floor_divide_nvidia.cuh" + +namespace op::floor_divide::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(floor_divide) + +} // namespace op::floor_divide::nvidia diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh new file mode 100644 index 000000000..1c70343cf --- /dev/null +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FLOOR_DIVIDE_CUDA_API_H__ +#define __FLOOR_DIVIDE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(floor_divide, nvidia) + +#endif // __FLOOR_DIVIDE_CUDA_API_H__ diff --git a/src/infiniop/ops/floor_divide/operator.cc b/src/infiniop/ops/floor_divide/operator.cc new file mode 100644 index 000000000..3de7b971d --- /dev/null +++ b/src/infiniop/ops/floor_divide/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/floor_divide_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/floor_divide_nvidia.cuh" +#endif + +BINARY_OP_IMPL(floor_divide, FloorDivide) diff --git a/src/infiniop/ops/fmax/cpu/fmax_cpu.cc b/src/infiniop/ops/fmax/cpu/fmax_cpu.cc new file mode 100644 index 000000000..72cda24f9 --- /dev/null +++ b/src/infiniop/ops/fmax/cpu/fmax_cpu.cc @@ -0,0 +1,8 @@ +#include "fmax_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::fmax::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(fmax) + +} // namespace op::fmax::cpu diff --git a/src/infiniop/ops/fmax/cpu/fmax_cpu.h b/src/infiniop/ops/fmax/cpu/fmax_cpu.h new file mode 100644 index 000000000..f9abc2e87 --- /dev/null +++ b/src/infiniop/ops/fmax/cpu/fmax_cpu.h @@ -0,0 +1,9 @@ +#ifndef __FMAX_CPU_H__ +#define __FMAX_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(fmax, cpu, 
op::elementwise::binary::BinaryMode::Fmax) + +#endif // __FMAX_CPU_H__ diff --git a/src/infiniop/ops/fmax/cuda/kernel.cuh b/src/infiniop/ops/fmax/cuda/kernel.cuh new file mode 100644 index 000000000..ad1c61116 --- /dev/null +++ b/src/infiniop/ops/fmax/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __FMAX_CUDA_H__ +#define __FMAX_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::fmax::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::fmax::cuda + +#endif // __FMAX_CUDA_H__ diff --git a/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu new file mode 100644 index 000000000..3a259f435 --- /dev/null +++ b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "fmax_nvidia.cuh" + +namespace op::fmax::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(fmax) + +} // namespace op::fmax::nvidia diff --git a/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh new file mode 100644 index 000000000..8e2b1d74a --- /dev/null +++ b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FMAX_NVIDIA_API_H__ +#define __FMAX_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmax, nvidia) + +#endif // __FMAX_NVIDIA_API_H__ diff --git a/src/infiniop/ops/fmax/operator.cc b/src/infiniop/ops/fmax/operator.cc new file mode 100644 index 000000000..32ce7235c --- /dev/null +++ b/src/infiniop/ops/fmax/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmax_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/fmax_nvidia.cuh" +#endif + +BINARY_OP_IMPL(fmax, Fmax) diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.cc b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc new file mode 100644 index 000000000..730371b63 --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc @@ -0,0 +1,8 @@ +#include "fmin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::fmin::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(fmin) + +} // namespace op::fmin::cpu diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.h b/src/infiniop/ops/fmin/cpu/fmin_cpu.h new file mode 100644 index 000000000..9ea1781e4 --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.h @@ -0,0 +1,9 @@ +#ifndef __FMIN_CPU_H__ +#define __FMIN_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(fmin, cpu, op::elementwise::binary::BinaryMode::Fmin) + +#endif // __FMIN_CPU_H__ diff --git a/src/infiniop/ops/fmin/cuda/kernel.cuh b/src/infiniop/ops/fmin/cuda/kernel.cuh new file mode 100644 index 000000000..57e641c8b --- /dev/null +++ b/src/infiniop/ops/fmin/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __FMIN_CUDA_H__ +#define __FMIN_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::fmin::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::fmin::cuda + +#endif // __FMIN_CUDA_H__ diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu new file mode 100644 index 000000000..cda271a51 --- /dev/null +++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + 
+#include "../cuda/kernel.cuh" +#include "fmin_nvidia.cuh" + +namespace op::fmin::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(fmin) + +} // namespace op::fmin::nvidia diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh new file mode 100644 index 000000000..10a391e40 --- /dev/null +++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FMIN_NVIDIA_API_H__ +#define __FMIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmin, nvidia) + +#endif // __FMIN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/fmin/operator.cc b/src/infiniop/ops/fmin/operator.cc new file mode 100644 index 000000000..f0e85cfe9 --- /dev/null +++ b/src/infiniop/ops/fmin/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/fmin_nvidia.cuh" +#endif + +BINARY_OP_IMPL(fmin, Fmin) diff --git a/src/infiniop/ops/ge/cpu/ge_cpu.cc b/src/infiniop/ops/ge/cpu/ge_cpu.cc new file mode 100644 index 000000000..56b3bc383 --- /dev/null +++ b/src/infiniop/ops/ge/cpu/ge_cpu.cc @@ -0,0 +1,8 @@ +#include "ge_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::ge::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(ge) + +} // namespace op::ge::cpu diff --git a/src/infiniop/ops/ge/cpu/ge_cpu.h b/src/infiniop/ops/ge/cpu/ge_cpu.h new file mode 100644 index 000000000..461c36411 --- /dev/null +++ b/src/infiniop/ops/ge/cpu/ge_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GE_CPU_H__ +#define __GE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(ge, cpu, op::elementwise::binary::BinaryMode::GreaterOrEqual) + +#endif // __GE_CPU_H__ diff --git a/src/infiniop/ops/ge/cuda/kernel.cuh b/src/infiniop/ops/ge/cuda/kernel.cuh new file mode 100644 index 000000000..40dfdd8b2 --- /dev/null +++ b/src/infiniop/ops/ge/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __GE_CUDA_H__ +#define __GE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::ge::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::ge::cuda + +#endif // __GE_CUDA_H__ diff --git a/src/infiniop/ops/ge/nvidia/ge_nvidia.cu b/src/infiniop/ops/ge/nvidia/ge_nvidia.cu new file mode 100644 index 000000000..c29f985aa --- /dev/null +++ b/src/infiniop/ops/ge/nvidia/ge_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "ge_nvidia.cuh" + +namespace op::ge::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(ge) + +} // namespace op::ge::nvidia diff --git a/src/infiniop/ops/ge/nvidia/ge_nvidia.cuh b/src/infiniop/ops/ge/nvidia/ge_nvidia.cuh new file mode 100644 index 000000000..a20742080 --- /dev/null +++ b/src/infiniop/ops/ge/nvidia/ge_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GE_CUDA_API_H__ +#define __GE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ge, nvidia) + +#endif // __GE_CUDA_API_H__ diff --git a/src/infiniop/ops/ge/operator.cc b/src/infiniop/ops/ge/operator.cc new file mode 100644 index 000000000..eda9851d2 --- /dev/null +++ b/src/infiniop/ops/ge/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include 
"cpu/ge_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ge_nvidia.cuh" +#endif + +BINARY_OP_IMPL(ge, Ge) diff --git a/src/infiniop/ops/gt/cpu/gt_cpu.cc b/src/infiniop/ops/gt/cpu/gt_cpu.cc new file mode 100644 index 000000000..bdd204dde --- /dev/null +++ b/src/infiniop/ops/gt/cpu/gt_cpu.cc @@ -0,0 +1,8 @@ +#include "gt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::gt::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(gt) + +} // namespace op::gt::cpu diff --git a/src/infiniop/ops/gt/cpu/gt_cpu.h b/src/infiniop/ops/gt/cpu/gt_cpu.h new file mode 100644 index 000000000..a637a856d --- /dev/null +++ b/src/infiniop/ops/gt/cpu/gt_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GT_CPU_H__ +#define __GT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(gt, cpu, op::elementwise::binary::BinaryMode::Greater) + +#endif // __GT_CPU_H__ diff --git a/src/infiniop/ops/gt/cuda/kernel.cuh b/src/infiniop/ops/gt/cuda/kernel.cuh new file mode 100644 index 000000000..1d4bfa720 --- /dev/null +++ b/src/infiniop/ops/gt/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __GT_CUDA_H__ +#define __GT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::gt::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::gt::cuda + +#endif // __GT_CUDA_H__ diff --git a/src/infiniop/ops/gt/nvidia/gt_nvidia.cu b/src/infiniop/ops/gt/nvidia/gt_nvidia.cu new file mode 100644 index 000000000..a0eea040e --- /dev/null +++ b/src/infiniop/ops/gt/nvidia/gt_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "gt_nvidia.cuh" + +namespace op::gt::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(gt) + +} // namespace op::gt::nvidia diff --git a/src/infiniop/ops/gt/nvidia/gt_nvidia.cuh b/src/infiniop/ops/gt/nvidia/gt_nvidia.cuh new file mode 100644 index 000000000..ce5517d89 --- /dev/null +++ b/src/infiniop/ops/gt/nvidia/gt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GT_CUDA_API_H__ +#define __GT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gt, nvidia) + +#endif // __GT_CUDA_API_H__ diff --git a/src/infiniop/ops/gt/operator.cc b/src/infiniop/ops/gt/operator.cc new file mode 100644 index 000000000..74a5cec53 --- /dev/null +++ b/src/infiniop/ops/gt/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/gt_nvidia.cuh" +#endif + +BINARY_OP_IMPL(gt, Gt) diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.cc b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc new file mode 100644 index 000000000..20f90fe40 --- /dev/null +++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc @@ -0,0 +1,8 @@ +#include "hypot_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::hypot::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(hypot) + +} // namespace op::hypot::cpu diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.h b/src/infiniop/ops/hypot/cpu/hypot_cpu.h new file mode 100644 index 000000000..d9b1db8cd --- /dev/null +++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.h @@ -0,0 +1,9 @@ +#ifndef __HYPOT_CPU_H__ +#define __HYPOT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include 
"../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(hypot, cpu, op::elementwise::binary::BinaryMode::Hypot) + +#endif // __HYPOT_CPU_H__ diff --git a/src/infiniop/ops/hypot/cuda/kernel.cuh b/src/infiniop/ops/hypot/cuda/kernel.cuh new file mode 100644 index 000000000..9616d5bbe --- /dev/null +++ b/src/infiniop/ops/hypot/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __HYPOT_CUDA_H__ +#define __HYPOT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::hypot::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::hypot::cuda + +#endif // __HYPOT_CUDA_H__ diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu new file mode 100644 index 000000000..5c3aaaa73 --- /dev/null +++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "hypot_nvidia.cuh" + +namespace op::hypot::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(hypot) + +} // namespace op::hypot::nvidia diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh new file mode 100644 index 000000000..879bc02dc --- /dev/null +++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HYPOT_NVIDIA_API_H__ +#define __HYPOT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hypot, nvidia) + +#endif // __HYPOT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/hypot/operator.cc b/src/infiniop/ops/hypot/operator.cc new file mode 100644 index 000000000..a2ebf9bea --- /dev/null +++ b/src/infiniop/ops/hypot/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hypot_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/hypot_nvidia.cuh" +#endif + +BINARY_OP_IMPL(hypot, Hypot) diff --git a/src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc new file mode 100644 index 000000000..f7bcb3d8a --- /dev/null +++ b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc @@ -0,0 +1,8 @@ +#include "isfinite_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::isfinite::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(isfinite) + +} // namespace op::isfinite::cpu diff --git a/src/infiniop/ops/isfinite/cpu/isfinite_cpu.h b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.h new file mode 100644 index 000000000..9a0fa1c3f --- /dev/null +++ b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ISFINITE_CPU_H__ +#define __ISFINITE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(isfinite, cpu, op::elementwise::unary::UnaryMode::IsFinite) + +#endif // __ISFINITE_CPU_H__ diff --git a/src/infiniop/ops/isfinite/cuda/kernel.cuh b/src/infiniop/ops/isfinite/cuda/kernel.cuh new file mode 100644 index 000000000..edbd0a548 --- /dev/null +++ b/src/infiniop/ops/isfinite/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ISFINITE_CUDA_H__ +#define __ISFINITE_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::isfinite::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::isfinite::cuda + +#endif // __ISFINITE_CUDA_H__ diff --git a/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu 
b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu new file mode 100644 index 000000000..a76ea559d --- /dev/null +++ b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "isfinite_nvidia.cuh" + +namespace op::isfinite::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(isfinite) + +} // namespace op::isfinite::nvidia diff --git a/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh new file mode 100644 index 000000000..49e0f3e99 --- /dev/null +++ b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ISFINITE_NVIDIA_API_H__ +#define __ISFINITE_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(isfinite, nvidia) + +#endif // __ISFINITE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/isfinite/operator.cc b/src/infiniop/ops/isfinite/operator.cc new file mode 100644 index 000000000..92c88523e --- /dev/null +++ b/src/infiniop/ops/isfinite/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/isfinite_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/isfinite_nvidia.cuh" +#endif + +UNARY_OP_IMPL(isfinite, IsFinite) diff --git a/src/infiniop/ops/isinf/cpu/isinf_cpu.cc b/src/infiniop/ops/isinf/cpu/isinf_cpu.cc new file mode 100644 index 000000000..8327746fc --- /dev/null +++ b/src/infiniop/ops/isinf/cpu/isinf_cpu.cc @@ -0,0 +1,8 @@ +#include "isinf_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::isinf::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(isinf) + +} // namespace op::isinf::cpu diff --git a/src/infiniop/ops/isinf/cpu/isinf_cpu.h b/src/infiniop/ops/isinf/cpu/isinf_cpu.h new file mode 100644 index 000000000..edc4a1fcc --- /dev/null +++ b/src/infiniop/ops/isinf/cpu/isinf_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ISINF_CPU_H__ +#define __ISINF_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(isinf, cpu, op::elementwise::unary::UnaryMode::IsInf) + +#endif // __ISINF_CPU_H__ diff --git a/src/infiniop/ops/isinf/cuda/kernel.cuh b/src/infiniop/ops/isinf/cuda/kernel.cuh new file mode 100644 index 000000000..272ff3f08 --- /dev/null +++ b/src/infiniop/ops/isinf/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ISINF_CUDA_H__ +#define __ISINF_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::isinf::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::isinf::cuda + +#endif // __ISINF_CUDA_H__ diff --git a/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu new file mode 100644 index 000000000..1fd88363f --- /dev/null +++ b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "isinf_nvidia.cuh" + +namespace op::isinf::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(isinf) + +} // namespace op::isinf::nvidia diff --git a/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh new file mode 100644 index 000000000..072a749ab --- /dev/null +++ b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ISINF_NVIDIA_API_H__ +#define 
__ISINF_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(isinf, nvidia) + +#endif // __ISINF_NVIDIA_API_H__ diff --git a/src/infiniop/ops/isinf/operator.cc b/src/infiniop/ops/isinf/operator.cc new file mode 100644 index 000000000..de042520d --- /dev/null +++ b/src/infiniop/ops/isinf/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/isinf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/isinf_nvidia.cuh" +#endif + +UNARY_OP_IMPL(isinf, IsInf) diff --git a/src/infiniop/ops/isnan/cpu/isnan_cpu.cc b/src/infiniop/ops/isnan/cpu/isnan_cpu.cc new file mode 100644 index 000000000..398bc3039 --- /dev/null +++ b/src/infiniop/ops/isnan/cpu/isnan_cpu.cc @@ -0,0 +1,8 @@ +#include "isnan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::isnan::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(isnan) + +} // namespace op::isnan::cpu diff --git a/src/infiniop/ops/isnan/cpu/isnan_cpu.h b/src/infiniop/ops/isnan/cpu/isnan_cpu.h new file mode 100644 index 000000000..002ad36d4 --- /dev/null +++ b/src/infiniop/ops/isnan/cpu/isnan_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ISNAN_CPU_H__ +#define __ISNAN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(isnan, cpu, op::elementwise::unary::UnaryMode::IsNan) + +#endif // __ISNAN_CPU_H__ diff --git a/src/infiniop/ops/isnan/cuda/kernel.cuh b/src/infiniop/ops/isnan/cuda/kernel.cuh new file mode 100644 index 000000000..890688422 --- /dev/null +++ b/src/infiniop/ops/isnan/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ISNAN_CUDA_H__ +#define __ISNAN_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::isnan::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::isnan::cuda + +#endif // __ISNAN_CUDA_H__ diff --git a/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu new file mode 100644 index 000000000..8f1cd695e --- /dev/null +++ b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "isnan_nvidia.cuh" + +namespace op::isnan::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(isnan) + +} // namespace op::isnan::nvidia diff --git a/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh new file mode 100644 index 000000000..341adf103 --- /dev/null +++ b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ISNAN_NVIDIA_API_H__ +#define __ISNAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(isnan, nvidia) + +#endif // __ISNAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/isnan/operator.cc b/src/infiniop/ops/isnan/operator.cc new file mode 100644 index 000000000..fedad566a --- /dev/null +++ b/src/infiniop/ops/isnan/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/isnan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/isnan_nvidia.cuh" +#endif + +UNARY_OP_IMPL(isnan, IsNan) diff --git a/src/infiniop/ops/le/cpu/le_cpu.cc b/src/infiniop/ops/le/cpu/le_cpu.cc new file mode 100644 
index 000000000..9be2df7a0 --- /dev/null +++ b/src/infiniop/ops/le/cpu/le_cpu.cc @@ -0,0 +1,8 @@ +#include "le_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::le::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(le) + +} // namespace op::le::cpu diff --git a/src/infiniop/ops/le/cpu/le_cpu.h b/src/infiniop/ops/le/cpu/le_cpu.h new file mode 100644 index 000000000..2a5679a16 --- /dev/null +++ b/src/infiniop/ops/le/cpu/le_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LE_CPU_H__ +#define __LE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(le, cpu, op::elementwise::binary::BinaryMode::LessOrEqual) + +#endif // __LE_CPU_H__ diff --git a/src/infiniop/ops/le/cuda/kernel.cuh b/src/infiniop/ops/le/cuda/kernel.cuh new file mode 100644 index 000000000..d8d64f9f9 --- /dev/null +++ b/src/infiniop/ops/le/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LE_CUDA_H__ +#define __LE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::le::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::le::cuda + +#endif // __LE_CUDA_H__ diff --git a/src/infiniop/ops/le/nvidia/le_nvidia.cu b/src/infiniop/ops/le/nvidia/le_nvidia.cu new file mode 100644 index 000000000..93d1327fb --- /dev/null +++ b/src/infiniop/ops/le/nvidia/le_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "le_nvidia.cuh" + +namespace op::le::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(le) + +} // namespace op::le::nvidia diff --git a/src/infiniop/ops/le/nvidia/le_nvidia.cuh b/src/infiniop/ops/le/nvidia/le_nvidia.cuh new file mode 100644 index 000000000..62ea3d392 --- /dev/null +++ b/src/infiniop/ops/le/nvidia/le_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LE_CUDA_API_H__ +#define __LE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(le, nvidia) + +#endif // __LE_CUDA_API_H__ diff --git a/src/infiniop/ops/le/operator.cc b/src/infiniop/ops/le/operator.cc new file mode 100644 index 000000000..1809a3241 --- /dev/null +++ b/src/infiniop/ops/le/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/le_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/le_nvidia.cuh" +#endif + +BINARY_OP_IMPL(le, Le) diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.cc b/src/infiniop/ops/log10/cpu/log10_cpu.cc new file mode 100644 index 000000000..2b28eff21 --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.cc @@ -0,0 +1,8 @@ +#include "log10_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::log10::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(log10) + +} // namespace op::log10::cpu diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.h b/src/infiniop/ops/log10/cpu/log10_cpu.h new file mode 100644 index 000000000..5ddc6d437 --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOG10_CPU_H__ +#define __LOG10_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(log10, cpu, op::elementwise::unary::UnaryMode::Log10) + +#endif // __LOG10_CPU_H__ diff --git a/src/infiniop/ops/log10/cuda/kernel.cuh b/src/infiniop/ops/log10/cuda/kernel.cuh new file mode 100644 index 000000000..eca7e0517 --- 
/dev/null +++ b/src/infiniop/ops/log10/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOG10_CUDA_H__ +#define __LOG10_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::log10::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::log10::cuda + +#endif // __LOG10_CUDA_H__ diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu new file mode 100644 index 000000000..fbfca7d35 --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "log10_nvidia.cuh" + +namespace op::log10::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(log10) + +} // namespace op::log10::nvidia diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh new file mode 100644 index 000000000..1514994eb --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG10_CUDA_API_H__ +#define __LOG10_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log10, nvidia) + +#endif // __LOG10_CUDA_API_H__ diff --git a/src/infiniop/ops/log10/operator.cc b/src/infiniop/ops/log10/operator.cc new file mode 100644 index 000000000..da799c858 --- /dev/null +++ b/src/infiniop/ops/log10/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log10_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log10_nvidia.cuh" +#endif + +UNARY_OP_IMPL(log10, Log10) diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.cc b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc new file mode 100644 index 000000000..12aa543a9 --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc @@ -0,0 +1,8 @@ +#include "log1p_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::log1p::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(log1p) + +} // namespace op::log1p::cpu diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.h b/src/infiniop/ops/log1p/cpu/log1p_cpu.h new file mode 100644 index 000000000..0c999a48a --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOG1P_CPU_H__ +#define __LOG1P_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(log1p, cpu, op::elementwise::unary::UnaryMode::Log1p) + +#endif // __LOG1P_CPU_H__ diff --git a/src/infiniop/ops/log1p/cuda/kernel.cuh b/src/infiniop/ops/log1p/cuda/kernel.cuh new file mode 100644 index 000000000..0bcfc73de --- /dev/null +++ b/src/infiniop/ops/log1p/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOG1P_CUDA_H__ +#define __LOG1P_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::log1p::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::log1p::cuda + +#endif // __LOG1P_CUDA_H__ diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu new file mode 100644 index 000000000..695b7c743 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "log1p_nvidia.cuh" + +namespace op::log1p::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(log1p) + +} // namespace 
op::log1p::nvidia diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh new file mode 100644 index 000000000..2522dbc08 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG1P_NVIDIA_API_H__ +#define __LOG1P_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log1p, nvidia) + +#endif // __LOG1P_NVIDIA_API_H__ diff --git a/src/infiniop/ops/log1p/operator.cc b/src/infiniop/ops/log1p/operator.cc new file mode 100644 index 000000000..795e9c009 --- /dev/null +++ b/src/infiniop/ops/log1p/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log1p_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log1p_nvidia.cuh" +#endif + +UNARY_OP_IMPL(log1p, Log1p) diff --git a/src/infiniop/ops/log2/cpu/log2_cpu.cc b/src/infiniop/ops/log2/cpu/log2_cpu.cc new file mode 100644 index 000000000..79676e4ab --- /dev/null +++ b/src/infiniop/ops/log2/cpu/log2_cpu.cc @@ -0,0 +1,8 @@ +#include "log2_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::log2::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(log2) + +} // namespace op::log2::cpu diff --git a/src/infiniop/ops/log2/cpu/log2_cpu.h b/src/infiniop/ops/log2/cpu/log2_cpu.h new file mode 100644 index 000000000..db62d672c --- /dev/null +++ b/src/infiniop/ops/log2/cpu/log2_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOG2_CPU_H__ +#define __LOG2_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(log2, cpu, op::elementwise::unary::UnaryMode::Log2) + +#endif // __LOG2_CPU_H__ diff --git a/src/infiniop/ops/log2/cuda/kernel.cuh b/src/infiniop/ops/log2/cuda/kernel.cuh new file mode 100644 index 000000000..3d50dd7ae --- /dev/null +++ b/src/infiniop/ops/log2/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOG2_CUDA_H__ +#define __LOG2_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::log2::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::log2::cuda + +#endif // __LOG2_CUDA_H__ diff --git a/src/infiniop/ops/log2/nvidia/log2_nvidia.cu b/src/infiniop/ops/log2/nvidia/log2_nvidia.cu new file mode 100644 index 000000000..68a4e33a0 --- /dev/null +++ b/src/infiniop/ops/log2/nvidia/log2_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "log2_nvidia.cuh" + +namespace op::log2::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(log2) + +} // namespace op::log2::nvidia diff --git a/src/infiniop/ops/log2/nvidia/log2_nvidia.cuh b/src/infiniop/ops/log2/nvidia/log2_nvidia.cuh new file mode 100644 index 000000000..e1fc178e9 --- /dev/null +++ b/src/infiniop/ops/log2/nvidia/log2_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG2_CUDA_API_H__ +#define __LOG2_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log2, nvidia) + +#endif // __LOG2_CUDA_API_H__ diff --git a/src/infiniop/ops/log2/operator.cc b/src/infiniop/ops/log2/operator.cc new file mode 100644 index 000000000..70deeaa68 --- /dev/null +++ b/src/infiniop/ops/log2/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log2_cpu.h" +#endif +#if 
defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log2_nvidia.cuh" +#endif + +UNARY_OP_IMPL(log2, Log2) diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..4f664c648 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,8 @@ +#include "logical_and_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::logical_and::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(logical_and) + +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..531a8d31d --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(logical_and, cpu, op::elementwise::binary::BinaryMode::LogicalAnd) + +#endif // __LOGICAL_AND_CPU_H__ diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..85b80fee6 --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::logical_and::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::logical_and::cuda + +#endif // __LOGICAL_AND_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..2bf34aa08 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "logical_and_nvidia.cuh" + +namespace op::logical_and::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(logical_and) + +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..9d68754bf --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif // __LOGICAL_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..036d8d061 --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/logical_and_nvidia.cuh" +#endif + +BINARY_OP_IMPL(logical_and, LogicalAnd) diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..dcc824a16 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,8 @@ +#include "logical_or_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::logical_or::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(logical_or) + +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..77b1cc6e7 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(logical_or, cpu, op::elementwise::binary::BinaryMode::LogicalOr) + +#endif // __LOGICAL_OR_CPU_H__ diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..60ec81e28 --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::logical_or::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..1a7830e84 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "logical_or_nvidia.cuh" + +namespace op::logical_or::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(logical_or) + +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..a70bd8da7 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif // __LOGICAL_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..7e06f23a0 --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/logical_or_nvidia.cuh" +#endif + +BINARY_OP_IMPL(logical_or, LogicalOr) diff --git a/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc new file mode 100644 index 000000000..d7512db10 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc @@ -0,0 +1,8 @@ +#include "logical_xor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::logical_xor::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(logical_xor) + +} // namespace op::logical_xor::cpu diff --git a/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h new file mode 100644 index 000000000..2e4b0b038 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOGICAL_XOR_CPU_H__ +#define __LOGICAL_XOR_CPU_H__ + 
+#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(logical_xor, cpu, op::elementwise::binary::BinaryMode::LogicalXor) + +#endif // __LOGICAL_XOR_CPU_H__ diff --git a/src/infiniop/ops/logical_xor/cuda/kernel.cuh b/src/infiniop/ops/logical_xor/cuda/kernel.cuh new file mode 100644 index 000000000..e3ab59ef9 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOGICAL_XOR_CUDA_H__ +#define __LOGICAL_XOR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::logical_xor::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::logical_xor::cuda + +#endif // __LOGICAL_XOR_CUDA_H__ diff --git a/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu new file mode 100644 index 000000000..ac2bd04dc --- /dev/null +++ b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "logical_xor_nvidia.cuh" + +namespace op::logical_xor::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(logical_xor) + +} // namespace op::logical_xor::nvidia diff --git a/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh new file mode 100644 index 000000000..f264f8025 --- /dev/null +++ b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_XOR_CUDA_API_H__ +#define __LOGICAL_XOR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_xor, nvidia) + +#endif // __LOGICAL_XOR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_xor/operator.cc b/src/infiniop/ops/logical_xor/operator.cc new file mode 100644 index 000000000..3af7d138a --- /dev/null +++ b/src/infiniop/ops/logical_xor/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_xor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/logical_xor_nvidia.cuh" +#endif + +BINARY_OP_IMPL(logical_xor, LogicalXor) diff --git a/src/infiniop/ops/lt/cpu/lt_cpu.cc b/src/infiniop/ops/lt/cpu/lt_cpu.cc new file mode 100644 index 000000000..25a5bfae0 --- /dev/null +++ b/src/infiniop/ops/lt/cpu/lt_cpu.cc @@ -0,0 +1,8 @@ +#include "lt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::lt::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(lt) + +} // namespace op::lt::cpu diff --git a/src/infiniop/ops/lt/cpu/lt_cpu.h b/src/infiniop/ops/lt/cpu/lt_cpu.h new file mode 100644 index 000000000..5870ee818 --- /dev/null +++ b/src/infiniop/ops/lt/cpu/lt_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LT_CPU_H__ +#define __LT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(lt, cpu, op::elementwise::binary::BinaryMode::Less) + +#endif // __LT_CPU_H__ diff --git a/src/infiniop/ops/lt/cuda/kernel.cuh b/src/infiniop/ops/lt/cuda/kernel.cuh new file mode 100644 index 000000000..fa81f75ba --- /dev/null +++ b/src/infiniop/ops/lt/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LT_CUDA_H__ +#define __LT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::lt::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace 
op::lt::cuda + +#endif // __LT_CUDA_H__ diff --git a/src/infiniop/ops/lt/nvidia/lt_nvidia.cu b/src/infiniop/ops/lt/nvidia/lt_nvidia.cu new file mode 100644 index 000000000..f019e95fe --- /dev/null +++ b/src/infiniop/ops/lt/nvidia/lt_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "lt_nvidia.cuh" + +namespace op::lt::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(lt) + +} // namespace op::lt::nvidia diff --git a/src/infiniop/ops/lt/nvidia/lt_nvidia.cuh b/src/infiniop/ops/lt/nvidia/lt_nvidia.cuh new file mode 100644 index 000000000..198fe9161 --- /dev/null +++ b/src/infiniop/ops/lt/nvidia/lt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LT_CUDA_API_H__ +#define __LT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(lt, nvidia) + +#endif // __LT_CUDA_API_H__ diff --git a/src/infiniop/ops/lt/operator.cc b/src/infiniop/ops/lt/operator.cc new file mode 100644 index 000000000..9b392768a --- /dev/null +++ b/src/infiniop/ops/lt/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/lt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/lt_nvidia.cuh" +#endif + +BINARY_OP_IMPL(lt, Lt) diff --git a/src/infiniop/ops/ne/cpu/ne_cpu.cc b/src/infiniop/ops/ne/cpu/ne_cpu.cc new file mode 100644 index 000000000..a8eb321ec --- /dev/null +++ b/src/infiniop/ops/ne/cpu/ne_cpu.cc @@ -0,0 +1,8 @@ +#include "ne_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::ne::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(ne) + +} // namespace op::ne::cpu diff --git a/src/infiniop/ops/ne/cpu/ne_cpu.h b/src/infiniop/ops/ne/cpu/ne_cpu.h new file mode 100644 index 000000000..65e592742 --- /dev/null +++ b/src/infiniop/ops/ne/cpu/ne_cpu.h @@ -0,0 +1,9 @@ +#ifndef __NE_CPU_H__ +#define __NE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(ne, cpu, op::elementwise::binary::BinaryMode::NotEqual) + +#endif // __NE_CPU_H__ diff --git a/src/infiniop/ops/ne/cuda/kernel.cuh b/src/infiniop/ops/ne/cuda/kernel.cuh new file mode 100644 index 000000000..339819488 --- /dev/null +++ b/src/infiniop/ops/ne/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __NE_CUDA_H__ +#define __NE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::ne::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::ne::cuda + +#endif // __NE_CUDA_H__ diff --git a/src/infiniop/ops/ne/nvidia/ne_nvidia.cu b/src/infiniop/ops/ne/nvidia/ne_nvidia.cu new file mode 100644 index 000000000..2de40bb53 --- /dev/null +++ b/src/infiniop/ops/ne/nvidia/ne_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "ne_nvidia.cuh" + +namespace op::ne::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(ne) + +} // namespace op::ne::nvidia diff --git a/src/infiniop/ops/ne/nvidia/ne_nvidia.cuh b/src/infiniop/ops/ne/nvidia/ne_nvidia.cuh new file mode 100644 index 000000000..514d2a884 --- /dev/null +++ b/src/infiniop/ops/ne/nvidia/ne_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __NE_CUDA_API_H__ +#define __NE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ne, nvidia) + +#endif // __NE_CUDA_API_H__ diff --git 
a/src/infiniop/ops/ne/operator.cc b/src/infiniop/ops/ne/operator.cc new file mode 100644 index 000000000..cb4c1ed6d --- /dev/null +++ b/src/infiniop/ops/ne/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/ne_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ne_nvidia.cuh" +#endif + +BINARY_OP_IMPL(ne, Ne) diff --git a/src/infiniop/ops/remainder/cpu/remainder_cpu.cc b/src/infiniop/ops/remainder/cpu/remainder_cpu.cc new file mode 100644 index 000000000..6ccb2cd63 --- /dev/null +++ b/src/infiniop/ops/remainder/cpu/remainder_cpu.cc @@ -0,0 +1,8 @@ +#include "remainder_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::remainder::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(remainder) + +} // namespace op::remainder::cpu diff --git a/src/infiniop/ops/remainder/cpu/remainder_cpu.h b/src/infiniop/ops/remainder/cpu/remainder_cpu.h new file mode 100644 index 000000000..db6b7f760 --- /dev/null +++ b/src/infiniop/ops/remainder/cpu/remainder_cpu.h @@ -0,0 +1,9 @@ +#ifndef __REMAINDER_CPU_H__ +#define __REMAINDER_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(remainder, cpu, op::elementwise::binary::BinaryMode::Remainder) + +#endif // __REMAINDER_CPU_H__ diff --git a/src/infiniop/ops/remainder/cuda/kernel.cuh b/src/infiniop/ops/remainder/cuda/kernel.cuh new file mode 100644 index 000000000..48af7d5d7 --- /dev/null +++ b/src/infiniop/ops/remainder/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __REMAINDER_CUDA_H__ +#define __REMAINDER_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::remainder::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::remainder::cuda + +#endif // __REMAINDER_CUDA_H__ diff --git a/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu new file mode 100644 index 000000000..9bea21004 --- /dev/null +++ b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "remainder_nvidia.cuh" + +namespace op::remainder::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(remainder) + +} // namespace op::remainder::nvidia diff --git a/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh new file mode 100644 index 000000000..d6834dfc4 --- /dev/null +++ b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REMAINDER_NVIDIA_API_H__ +#define __REMAINDER_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(remainder, nvidia) + +#endif // __REMAINDER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/remainder/operator.cc b/src/infiniop/ops/remainder/operator.cc new file mode 100644 index 000000000..c331cebc9 --- /dev/null +++ b/src/infiniop/ops/remainder/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/remainder_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/remainder_nvidia.cuh" +#endif + +BINARY_OP_IMPL(remainder, Remainder) diff --git a/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc 
b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc new file mode 100644 index 000000000..78b0138c5 --- /dev/null +++ b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc @@ -0,0 +1,8 @@ +#include "rsqrt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::rsqrt::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(rsqrt) + +} // namespace op::rsqrt::cpu diff --git a/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h new file mode 100644 index 000000000..9bafc6609 --- /dev/null +++ b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h @@ -0,0 +1,9 @@ +#ifndef __RSQRT_CPU_H__ +#define __RSQRT_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(rsqrt, cpu, op::elementwise::unary::UnaryMode::Rsqrt) + +#endif // __RSQRT_CPU_H__ diff --git a/src/infiniop/ops/rsqrt/cuda/kernel.cuh b/src/infiniop/ops/rsqrt/cuda/kernel.cuh new file mode 100644 index 000000000..9381f2bae --- /dev/null +++ b/src/infiniop/ops/rsqrt/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __RSQRT_CUDA_H__ +#define __RSQRT_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::rsqrt::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::rsqrt::cuda + +#endif // __RSQRT_CUDA_H__ diff --git a/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu new file mode 100644 index 000000000..4cdaccc19 --- /dev/null +++ b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "rsqrt_nvidia.cuh" + +namespace op::rsqrt::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(rsqrt) + +} // namespace op::rsqrt::nvidia diff --git a/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh new file mode 100644 index 000000000..afffff923 --- /dev/null +++ b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RSQRT_NVIDIA_API_H__ +#define __RSQRT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(rsqrt, nvidia) + +#endif // __RSQRT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/rsqrt/operator.cc b/src/infiniop/ops/rsqrt/operator.cc new file mode 100644 index 000000000..d92e32510 --- /dev/null +++ b/src/infiniop/ops/rsqrt/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/rsqrt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/rsqrt_nvidia.cuh" +#endif + +UNARY_OP_IMPL(rsqrt, Rsqrt) diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..c7b172bb0 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,8 @@ +#include "sin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::sin::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(sin) + +} // namespace op::sin::cpu diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..14117695d --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SIN_CPU_H__ +#define __SIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(sin, cpu, op::elementwise::unary::UnaryMode::Sin) + +#endif // 
__SIN_CPU_H__ diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..918bb8345 --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __SIN_CUDA_H__ +#define __SIN_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::sin::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::sin::cuda + +#endif // __SIN_CUDA_H__ diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..36c667ccf --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(sin) + +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..617119eb4 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_NVIDIA_API_H__ +#define __SIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..6772de6fe --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sin_nvidia.cuh" +#endif + +UNARY_OP_IMPL(sin, Sin) diff --git a/src/infiniop/ops/sinc/cpu/sinc_cpu.cc b/src/infiniop/ops/sinc/cpu/sinc_cpu.cc new file mode 100644 index 000000000..6a5dc21d3 --- /dev/null +++ b/src/infiniop/ops/sinc/cpu/sinc_cpu.cc @@ -0,0 +1,8 @@ +#include "sinc_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::sinc::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(sinc) + +} // namespace op::sinc::cpu diff --git a/src/infiniop/ops/sinc/cpu/sinc_cpu.h b/src/infiniop/ops/sinc/cpu/sinc_cpu.h new file mode 100644 index 000000000..094fb2f30 --- /dev/null +++ b/src/infiniop/ops/sinc/cpu/sinc_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SINC_CPU_H__ +#define __SINC_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(sinc, cpu, op::elementwise::unary::UnaryMode::Sinc) + +#endif // __SINC_CPU_H__ diff --git a/src/infiniop/ops/sinc/cuda/kernel.cuh b/src/infiniop/ops/sinc/cuda/kernel.cuh new file mode 100644 index 000000000..fc31025b0 --- /dev/null +++ b/src/infiniop/ops/sinc/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __SINC_CUDA_H__ +#define __SINC_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::sinc::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::sinc::cuda + +#endif // __SINC_CUDA_H__ diff --git a/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu new file mode 100644 index 000000000..85d36106b --- /dev/null +++ b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "sinc_nvidia.cuh" + +namespace op::sinc::nvidia { + 
+ELEMENTWISE_NVIDIA_IMPL_UNARY(sinc) + +} // namespace op::sinc::nvidia diff --git a/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh new file mode 100644 index 000000000..04058aafb --- /dev/null +++ b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SINC_NVIDIA_API_H__ +#define __SINC_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sinc, nvidia) + +#endif // __SINC_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sinc/operator.cc b/src/infiniop/ops/sinc/operator.cc new file mode 100644 index 000000000..636030916 --- /dev/null +++ b/src/infiniop/ops/sinc/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sinc_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sinc_nvidia.cuh" +#endif + +UNARY_OP_IMPL(sinc, Sinc) diff --git a/src/infiniop/ops/square/cpu/square_cpu.cc b/src/infiniop/ops/square/cpu/square_cpu.cc new file mode 100644 index 000000000..2ba497866 --- /dev/null +++ b/src/infiniop/ops/square/cpu/square_cpu.cc @@ -0,0 +1,8 @@ +#include "square_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::square::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(square) + +} // namespace op::square::cpu diff --git a/src/infiniop/ops/square/cpu/square_cpu.h b/src/infiniop/ops/square/cpu/square_cpu.h new file mode 100644 index 000000000..014bdcf79 --- /dev/null +++ b/src/infiniop/ops/square/cpu/square_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SQUARE_CPU_H__ +#define __SQUARE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(square, cpu, op::elementwise::unary::UnaryMode::Square) + +#endif // __SQUARE_CPU_H__ diff --git a/src/infiniop/ops/square/cuda/kernel.cuh b/src/infiniop/ops/square/cuda/kernel.cuh new file mode 100644 index 000000000..07dd19124 --- /dev/null +++ b/src/infiniop/ops/square/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __SQUARE_CUDA_H__ +#define __SQUARE_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::square::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::square::cuda + +#endif // __SQUARE_CUDA_H__ diff --git a/src/infiniop/ops/square/nvidia/square_nvidia.cu b/src/infiniop/ops/square/nvidia/square_nvidia.cu new file mode 100644 index 000000000..ac446d4cf --- /dev/null +++ b/src/infiniop/ops/square/nvidia/square_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "square_nvidia.cuh" + +namespace op::square::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(square) + +} // namespace op::square::nvidia diff --git a/src/infiniop/ops/square/nvidia/square_nvidia.cuh b/src/infiniop/ops/square/nvidia/square_nvidia.cuh new file mode 100644 index 000000000..deed0905a --- /dev/null +++ b/src/infiniop/ops/square/nvidia/square_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SQUARE_NVIDIA_API_H__ +#define __SQUARE_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(square, nvidia) + +#endif // __SQUARE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/square/operator.cc b/src/infiniop/ops/square/operator.cc new file mode 100644 index 000000000..b66e1621e --- /dev/null +++ b/src/infiniop/ops/square/operator.cc @@ -0,0 +1,11 @@ +#include 
"../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/square_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/square_nvidia.cuh" +#endif + +UNARY_OP_IMPL(square, Square) diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index a0f7cbccb..afa0e7bef 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -303,6 +303,108 @@ def pow_(lib): ] +@OpRegister.operator +def copysign_(lib): + lib.infiniopCreateCopySignDescriptor.restype = c_int32 + lib.infiniopCreateCopySignDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCopySignWorkspaceSize.restype = c_int32 + lib.infiniopGetCopySignWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCopySign.restype = c_int32 + lib.infiniopCopySign.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCopySignDescriptor.restype = c_int32 + lib.infiniopDestroyCopySignDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hypot_(lib): + lib.infiniopCreateHypotDescriptor.restype = c_int32 + lib.infiniopCreateHypotDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHypotWorkspaceSize.restype = c_int32 + lib.infiniopGetHypotWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHypot.restype = c_int32 + lib.infiniopHypot.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHypotDescriptor.restype = c_int32 + lib.infiniopDestroyHypotDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atan2_(lib): + lib.infiniopCreateAtan2Descriptor.restype = c_int32 + lib.infiniopCreateAtan2Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAtan2WorkspaceSize.restype = c_int32 + lib.infiniopGetAtan2WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAtan2.restype = c_int32 + lib.infiniopAtan2.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAtan2Descriptor.restype = c_int32 + lib.infiniopDestroyAtan2Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def div_(lib): lib.infiniopCreateDivDescriptor.restype = c_int32 @@ -314,14 +416,618 @@ def div_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetDivWorkspaceSize.restype = c_int32 - lib.infiniopGetDivWorkspaceSize.argtypes = [ + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + 
lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def floor_divide_(lib): + lib.infiniopCreateFloorDivideDescriptor.restype = c_int32 + lib.infiniopCreateFloorDivideDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetFloorDivideWorkspaceSize.restype = c_int32 + lib.infiniopGetFloorDivideWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopFloorDivide.restype = c_int32 + lib.infiniopFloorDivide.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyFloorDivideDescriptor.restype = c_int32 + lib.infiniopDestroyFloorDivideDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def mod_(lib): + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetModWorkspaceSize.restype = c_int32 + lib.infiniopGetModWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def remainder_(lib): + lib.infiniopCreateRemainderDescriptor.restype = c_int32 + lib.infiniopCreateRemainderDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetRemainderWorkspaceSize.restype = c_int32 + lib.infiniopGetRemainderWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopRemainder.restype = c_int32 + lib.infiniopRemainder.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyRemainderDescriptor.restype = c_int32 + lib.infiniopDestroyRemainderDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_(lib): + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def min_(lib): + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + 
infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMinWorkspaceSize.restype = c_int32 + lib.infiniopGetMinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def fmax_(lib): + lib.infiniopCreateFmaxDescriptor.restype = c_int32 + lib.infiniopCreateFmaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFmaxWorkspaceSize.restype = c_int32 + lib.infiniopGetFmaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFmax.restype = c_int32 + lib.infiniopFmax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFmaxDescriptor.restype = c_int32 + lib.infiniopDestroyFmaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def fmin_(lib): + lib.infiniopCreateFminDescriptor.restype = c_int32 + lib.infiniopCreateFminDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFminWorkspaceSize.restype = c_int32 + lib.infiniopGetFminWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFmin.restype = c_int32 + lib.infiniopFmin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFminDescriptor.restype = c_int32 + lib.infiniopDestroyFminDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gt_(lib): + lib.infiniopCreateGtDescriptor.restype = c_int32 + lib.infiniopCreateGtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGtWorkspaceSize.restype = c_int32 + lib.infiniopGetGtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGt.restype = c_int32 + lib.infiniopGt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGtDescriptor.restype = c_int32 + lib.infiniopDestroyGtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def lt_(lib): + lib.infiniopCreateLtDescriptor.restype = c_int32 + lib.infiniopCreateLtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLtWorkspaceSize.restype = c_int32 + lib.infiniopGetLtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLt.restype = c_int32 + lib.infiniopLt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLtDescriptor.restype = c_int32 + 
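Worth double-checking in this hunk: the fmax_ and fmin_ registrations declare six argument types for infiniopFmax/infiniopFmin, while every other binary entry point here (Div, Mod, Max, Min, Gt, Lt, ...) declares seven, the last c_void_p being the stream. If Fmax/Fmin follow the same (desc, workspace, workspace_size, c, a, b, stream) C signature as the rest, the bindings would presumably need the extra pointer as well, along these lines:

    # Assumed shape if infiniopFmax mirrors the other binary ops;
    # the trailing c_void_p would be the stream argument.
    lib.infiniopFmax.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # c
        c_void_p,  # a
        c_void_p,  # b
        c_void_p,  # stream
    ]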
lib.infiniopDestroyLtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ge_(lib): + lib.infiniopCreateGeDescriptor.restype = c_int32 + lib.infiniopCreateGeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGeWorkspaceSize.restype = c_int32 + lib.infiniopGetGeWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGe.restype = c_int32 + lib.infiniopGe.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGeDescriptor.restype = c_int32 + lib.infiniopDestroyGeDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def le_(lib): + lib.infiniopCreateLeDescriptor.restype = c_int32 + lib.infiniopCreateLeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLeWorkspaceSize.restype = c_int32 + lib.infiniopGetLeWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLe.restype = c_int32 + lib.infiniopLe.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeDescriptor.restype = c_int32 + lib.infiniopDestroyLeDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def eq_(lib): + lib.infiniopCreateEqDescriptor.restype = c_int32 + lib.infiniopCreateEqDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetEqWorkspaceSize.restype = c_int32 + lib.infiniopGetEqWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopEq.restype = c_int32 + lib.infiniopEq.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyEqDescriptor.restype = c_int32 + lib.infiniopDestroyEqDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ne_(lib): + lib.infiniopCreateNeDescriptor.restype = c_int32 + lib.infiniopCreateNeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetNeWorkspaceSize.restype = c_int32 + lib.infiniopGetNeWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopNe.restype = c_int32 + lib.infiniopNe.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyNeDescriptor.restype = c_int32 + lib.infiniopDestroyNeDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_and_(lib): + lib.infiniopCreateLogicalAndDescriptor.restype = c_int32 + lib.infiniopCreateLogicalAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalAndWorkspaceSize.argtypes 
= [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalAnd.restype = c_int32 + lib.infiniopLogicalAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_or_(lib): + lib.infiniopCreateLogicalOrDescriptor.restype = c_int32 + lib.infiniopCreateLogicalOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalOr.restype = c_int32 + lib.infiniopLogicalOr.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalOrDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_xor_(lib): + lib.infiniopCreateLogicalXorDescriptor.restype = c_int32 + lib.infiniopCreateLogicalXorDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalXorWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalXorWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalXor.restype = c_int32 + lib.infiniopLogicalXor.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalXorDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalXorDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def bitwise_and_(lib): + lib.infiniopCreateBitwiseAndDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBitwiseAndWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseAndWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopBitwiseAnd.restype = c_int32 + lib.infiniopBitwiseAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyBitwiseAndDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def bitwise_or_(lib): + lib.infiniopCreateBitwiseOrDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBitwiseOrWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseOrWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopDiv.restype = c_int32 - lib.infiniopDiv.argtypes = [ + lib.infiniopBitwiseOr.restype = c_int32 + lib.infiniopBitwiseOr.argtypes = [ infiniopOperatorDescriptor_t, 
c_void_p, c_size_t, @@ -331,16 +1037,16 @@ def div_(lib): c_void_p, ] - lib.infiniopDestroyDivDescriptor.restype = c_int32 - lib.infiniopDestroyDivDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseOrDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseOrDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @OpRegister.operator -def mod_(lib): - lib.infiniopCreateModDescriptor.restype = c_int32 - lib.infiniopCreateModDescriptor.argtypes = [ +def bitwise_xor_(lib): + lib.infiniopCreateBitwiseXorDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseXorDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, @@ -348,14 +1054,14 @@ def mod_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetModWorkspaceSize.restype = c_int32 - lib.infiniopGetModWorkspaceSize.argtypes = [ + lib.infiniopGetBitwiseXorWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseXorWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopMod.restype = c_int32 - lib.infiniopMod.argtypes = [ + lib.infiniopBitwiseXor.restype = c_int32 + lib.infiniopBitwiseXor.argtypes = [ infiniopOperatorDescriptor_t, c_void_p, c_size_t, @@ -365,16 +1071,16 @@ def mod_(lib): c_void_p, ] - lib.infiniopDestroyModDescriptor.restype = c_int32 - lib.infiniopDestroyModDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseXorDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseXorDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @OpRegister.operator -def max_(lib): - lib.infiniopCreateMaxDescriptor.restype = c_int32 - lib.infiniopCreateMaxDescriptor.argtypes = [ +def bitwise_left_shift_(lib): + lib.infiniopCreateBitwiseLeftShiftDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseLeftShiftDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, @@ -382,14 +1088,14 @@ def max_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetMaxWorkspaceSize.restype = c_int32 - lib.infiniopGetMaxWorkspaceSize.argtypes = [ + lib.infiniopGetBitwiseLeftShiftWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseLeftShiftWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopMax.restype = c_int32 - lib.infiniopMax.argtypes = [ + lib.infiniopBitwiseLeftShift.restype = c_int32 + lib.infiniopBitwiseLeftShift.argtypes = [ infiniopOperatorDescriptor_t, c_void_p, c_size_t, @@ -399,16 +1105,16 @@ def max_(lib): c_void_p, ] - lib.infiniopDestroyMaxDescriptor.restype = c_int32 - lib.infiniopDestroyMaxDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseLeftShiftDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseLeftShiftDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @OpRegister.operator -def min_(lib): - lib.infiniopCreateMinDescriptor.restype = c_int32 - lib.infiniopCreateMinDescriptor.argtypes = [ +def bitwise_right_shift_(lib): + lib.infiniopCreateBitwiseRightShiftDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseRightShiftDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, @@ -416,14 +1122,14 @@ def min_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetMinWorkspaceSize.restype = c_int32 - lib.infiniopGetMinWorkspaceSize.argtypes = [ + lib.infiniopGetBitwiseRightShiftWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseRightShiftWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopMin.restype = c_int32 - lib.infiniopMin.argtypes = [ + 
lib.infiniopBitwiseRightShift.restype = c_int32 + lib.infiniopBitwiseRightShift.argtypes = [ infiniopOperatorDescriptor_t, c_void_p, c_size_t, @@ -433,8 +1139,8 @@ def min_(lib): c_void_p, ] - lib.infiniopDestroyMinDescriptor.restype = c_int32 - lib.infiniopDestroyMinDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseRightShiftDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseRightShiftDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @@ -1021,6 +1727,64 @@ def sqrt_(lib): ] +@OpRegister.operator +def square_(lib): + lib.infiniopCreateSquareDescriptor.restype = c_int32 + lib.infiniopCreateSquareDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSquareWorkspaceSize.restype = c_int32 + lib.infiniopGetSquareWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSquare.restype = c_int32 + lib.infiniopSquare.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySquareDescriptor.restype = c_int32 + lib.infiniopDestroySquareDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def rsqrt_(lib): + lib.infiniopCreateRsqrtDescriptor.restype = c_int32 + lib.infiniopCreateRsqrtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRsqrtWorkspaceSize.restype = c_int32 + lib.infiniopGetRsqrtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopRsqrt.restype = c_int32 + lib.infiniopRsqrt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRsqrtDescriptor.restype = c_int32 + lib.infiniopDestroyRsqrtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def log_(lib): lib.infiniopCreateLogDescriptor.restype = c_int32 @@ -1050,6 +1814,122 @@ def log_(lib): ] +@OpRegister.operator +def log2_(lib): + lib.infiniopCreateLog2Descriptor.restype = c_int32 + lib.infiniopCreateLog2Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLog2WorkspaceSize.restype = c_int32 + lib.infiniopGetLog2WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog2.restype = c_int32 + lib.infiniopLog2.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLog2Descriptor.restype = c_int32 + lib.infiniopDestroyLog2Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log10_(lib): + lib.infiniopCreateLog10Descriptor.restype = c_int32 + lib.infiniopCreateLog10Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLog10WorkspaceSize.restype = c_int32 + lib.infiniopGetLog10WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog10.restype = c_int32 + lib.infiniopLog10.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLog10Descriptor.restype = c_int32 + lib.infiniopDestroyLog10Descriptor.argtypes = [ + 
infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log1p_(lib): + lib.infiniopCreateLog1pDescriptor.restype = c_int32 + lib.infiniopCreateLog1pDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLog1pWorkspaceSize.restype = c_int32 + lib.infiniopGetLog1pWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog1p.restype = c_int32 + lib.infiniopLog1p.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLog1pDescriptor.restype = c_int32 + lib.infiniopDestroyLog1pDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def tan_(lib): lib.infiniopCreateTanDescriptor.restype = c_int32 @@ -2031,6 +2911,35 @@ def exp_(lib): ] +@OpRegister.operator +def exp2_(lib): + lib.infiniopCreateExp2Descriptor.restype = c_int32 + lib.infiniopCreateExp2Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetExp2WorkspaceSize.restype = c_int32 + lib.infiniopGetExp2WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopExp2.restype = c_int32 + lib.infiniopExp2.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExp2Descriptor.restype = c_int32 + lib.infiniopDestroyExp2Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def gather_(lib): lib.infiniopCreateGatherDescriptor.restype = c_int32 @@ -2092,6 +3001,122 @@ def hardswish_(lib): ] +@OpRegister.operator +def isnan_(lib): + lib.infiniopCreateIsNanDescriptor.restype = c_int32 + lib.infiniopCreateIsNanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetIsNanWorkspaceSize.restype = c_int32 + lib.infiniopGetIsNanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIsNan.restype = c_int32 + lib.infiniopIsNan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIsNanDescriptor.restype = c_int32 + lib.infiniopDestroyIsNanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def isinf_(lib): + lib.infiniopCreateIsInfDescriptor.restype = c_int32 + lib.infiniopCreateIsInfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetIsInfWorkspaceSize.restype = 
c_int32 + lib.infiniopGetIsInfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIsInf.restype = c_int32 + lib.infiniopIsInf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIsInfDescriptor.restype = c_int32 + lib.infiniopDestroyIsInfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def isfinite_(lib): + lib.infiniopCreateIsFiniteDescriptor.restype = c_int32 + lib.infiniopCreateIsFiniteDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetIsFiniteWorkspaceSize.restype = c_int32 + lib.infiniopGetIsFiniteWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIsFinite.restype = c_int32 + lib.infiniopIsFinite.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIsFiniteDescriptor.restype = c_int32 + lib.infiniopDestroyIsFiniteDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sinc_(lib): + lib.infiniopCreateSincDescriptor.restype = c_int32 + lib.infiniopCreateSincDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSincWorkspaceSize.restype = c_int32 + lib.infiniopGetSincWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSinc.restype = c_int32 + lib.infiniopSinc.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySincDescriptor.restype = c_int32 + lib.infiniopDestroySincDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def index_copy_inplace_(lib): lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 9b43c47c5..b6d8d4207 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -83,8 +83,25 @@ def __init__( InfiniDtype.BYTE, InfiniDtype.BOOL, ]: - randint_low = -2000000000 if randint_low is None else randint_low - randint_high = 2000000000 if randint_high is None else randint_high + # Set appropriate default ranges based on dtype + if randint_low is None or randint_high is None: + if dt == InfiniDtype.U8 or dt == InfiniDtype.BYTE: + randint_low = 0 if randint_low is None else randint_low + randint_high = 256 if randint_high is None else randint_high + elif dt == InfiniDtype.BOOL: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2 if randint_high is None else randint_high + elif dt == InfiniDtype.U16: + randint_low = 0 if randint_low is None else randint_low + randint_high = 65536 if randint_high is None else randint_high + elif dt in [InfiniDtype.U32, InfiniDtype.U64]: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + else: + # For signed integer types (I8, I16, I32, I64) + randint_low = -2000000000 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + self._torch_tensor = torch.randint( randint_low, randint_high, @@ -107,8 +124,25 @@ def __init__( torch_shape, dtype=to_torch_dtype(dt), 
device=torch_device_map[device] ) elif mode == "randint": - randint_low = -2000000000 if randint_low is None else randint_low - randint_high = 2000000000 if randint_high is None else randint_high + # Set appropriate default ranges based on dtype (same logic as mode="random") + if randint_low is None or randint_high is None: + if dt == InfiniDtype.U8 or dt == InfiniDtype.BYTE: + randint_low = 0 if randint_low is None else randint_low + randint_high = 256 if randint_high is None else randint_high + elif dt == InfiniDtype.BOOL: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2 if randint_high is None else randint_high + elif dt == InfiniDtype.U16: + randint_low = 0 if randint_low is None else randint_low + randint_high = 65536 if randint_high is None else randint_high + elif dt in [InfiniDtype.U32, InfiniDtype.U64]: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + else: + # For signed integer types (I8, I16, I32, I64) + randint_low = -2000000000 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + self._torch_tensor = torch.randint( randint_low, randint_high, diff --git a/test/infiniop/test_all_binary_ops.py b/test/infiniop/test_all_binary_ops.py index e08b3e41b..61fe15574 100644 --- a/test/infiniop/test_all_binary_ops.py +++ b/test/infiniop/test_all_binary_ops.py @@ -50,6 +50,31 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class FloorDivideTest(BinaryTestBase): + OP_NAME = "FloorDivide" + OP_NAME_LOWER = "floor_divide" + + @staticmethod + def torch_op(c, a, b): + torch.floor_divide(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For division, ensure b doesn't contain zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + class PowTest(BinaryTestBase): OP_NAME = "Pow" OP_NAME_LOWER = "pow" @@ -75,6 +100,81 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class CopySignTest(BinaryTestBase): + OP_NAME = "CopySign" + OP_NAME_LOWER = "copysign" + + @staticmethod + def torch_op(c, a, b): + torch.copysign(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Generate values with various magnitudes + return TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Generate values with various signs + return TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class HypotTest(BinaryTestBase): + OP_NAME = "Hypot" + OP_NAME_LOWER = "hypot" + + @staticmethod + def torch_op(c, a, b): + torch.hypot(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + 
TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class Atan2Test(BinaryTestBase): + OP_NAME = "Atan2" + OP_NAME_LOWER = "atan2" + + @staticmethod + def torch_op(c, a, b): + torch.atan2(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For atan2, avoid zeros in denominator (b) + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + class ModTest(BinaryTestBase): OP_NAME = "Mod" OP_NAME_LOWER = "mod" @@ -100,6 +200,31 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class RemainderTest(BinaryTestBase): + OP_NAME = "Remainder" + OP_NAME_LOWER = "remainder" + + @staticmethod + def torch_op(c, a, b): + torch.remainder(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Avoid zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + class MaxTest(BinaryTestBase): OP_NAME = "Max" OP_NAME_LOWER = "max" @@ -148,6 +273,466 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class FmaxTest(BinaryTestBase): + OP_NAME = "Fmax" + OP_NAME_LOWER = "fmax" + + @staticmethod + def torch_op(c, a, b): + # torch.fmax ignores NaN: if one is NaN, return the other + result = torch.fmax(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class FminTest(BinaryTestBase): + OP_NAME = "Fmin" + OP_NAME_LOWER = "fmin" + + @staticmethod + def torch_op(c, a, b): + # torch.fmin ignores NaN: if one is NaN, return the other + result = torch.fmin(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class GtTest(BinaryTestBase): + OP_NAME = "Gt" + OP_NAME_LOWER = "gt" + + @staticmethod + def torch_op(c, a, b): + # torch.gt returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.gt(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, 
"rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LtTest(BinaryTestBase): + OP_NAME = "Lt" + OP_NAME_LOWER = "lt" + + @staticmethod + def torch_op(c, a, b): + # torch.lt returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.lt(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class GeTest(BinaryTestBase): + OP_NAME = "Ge" + OP_NAME_LOWER = "ge" + + @staticmethod + def torch_op(c, a, b): + # torch.ge returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.ge(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LeTest(BinaryTestBase): + OP_NAME = "Le" + OP_NAME_LOWER = "le" + + @staticmethod + def torch_op(c, a, b): + # torch.le returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.le(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class EqTest(BinaryTestBase): + OP_NAME = "Eq" + OP_NAME_LOWER = "eq" + + @staticmethod + def torch_op(c, a, b): + # torch.eq returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.eq(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class NeTest(BinaryTestBase): + OP_NAME = "Ne" + OP_NAME_LOWER = "ne" + + @staticmethod + def torch_op(c, a, b): + # torch.ne returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.ne(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LogicalAndTest(BinaryTestBase): + OP_NAME = "LogicalAnd" + OP_NAME_LOWER = "logical_and" + + @staticmethod + def torch_op(c, a, b): + # torch.logical_and returns bool, convert to float (1.0 or 0.0) to match our implementation + result = 
torch.logical_and(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LogicalOrTest(BinaryTestBase): + OP_NAME = "LogicalOr" + OP_NAME_LOWER = "logical_or" + + @staticmethod + def torch_op(c, a, b): + # torch.logical_or returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.logical_or(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LogicalXorTest(BinaryTestBase): + OP_NAME = "LogicalXor" + OP_NAME_LOWER = "logical_xor" + + @staticmethod + def torch_op(c, a, b): + # torch.logical_xor returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.logical_xor(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class BitwiseAndTest(BinaryTestBase): + OP_NAME = "BitwiseAnd" + OP_NAME_LOWER = "bitwise_and" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_and only supports integral types + result = torch.bitwise_and(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseOrTest(BinaryTestBase): + OP_NAME = "BitwiseOr" + OP_NAME_LOWER = "bitwise_or" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_or only supports integral types + result = torch.bitwise_or(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, 
"rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseXorTest(BinaryTestBase): + OP_NAME = "BitwiseXor" + OP_NAME_LOWER = "bitwise_xor" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_xor only supports integral types + result = torch.bitwise_xor(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseLeftShiftTest(BinaryTestBase): + OP_NAME = "BitwiseLeftShift" + OP_NAME_LOWER = "bitwise_left_shift" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_left_shift only supports integral types + result = torch.bitwise_left_shift(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For shift operations, b should be non-negative and within reasonable range + # Generate shift amounts between 0 and bit_width-1 for each type + if dtype == InfiniDtype.U8: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=8) + elif dtype == InfiniDtype.I32: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=32) + elif dtype == InfiniDtype.I64: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=64) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseRightShiftTest(BinaryTestBase): + OP_NAME = "BitwiseRightShift" + OP_NAME_LOWER = "bitwise_right_shift" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_right_shift only supports integral types + result = torch.bitwise_right_shift(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For shift operations, b should be non-negative and within reasonable range + # Generate shift amounts between 0 and bit_width-1 for each type + if dtype == InfiniDtype.U8: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=8) + elif dtype == InfiniDtype.I32: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=32) + elif dtype == InfiniDtype.I64: + 
return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=64) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + # ============================================================================== # 算子注册表 # ============================================================================== @@ -155,10 +740,31 @@ def generate_input_b(shape, b_stride, dtype, device): # 所有 binary 算子的测试类映射 BINARY_OP_TESTS = { "div": DivTest, + "floor_divide": FloorDivideTest, "pow": PowTest, + "copysign": CopySignTest, + "hypot": HypotTest, + "atan2": Atan2Test, "mod": ModTest, + "remainder": RemainderTest, "max": MaxTest, "min": MinTest, + "fmax": FmaxTest, + "fmin": FminTest, + "gt": GtTest, + "lt": LtTest, + "ge": GeTest, + "le": LeTest, + "eq": EqTest, + "ne": NeTest, + "logical_and": LogicalAndTest, + "logical_or": LogicalOrTest, + "logical_xor": LogicalXorTest, + "bitwise_and": BitwiseAndTest, + "bitwise_or": BitwiseOrTest, + "bitwise_xor": BitwiseXorTest, + "bitwise_left_shift": BitwiseLeftShiftTest, + "bitwise_right_shift": BitwiseRightShiftTest, } diff --git a/test/infiniop/test_all_unary_ops.py b/test/infiniop/test_all_unary_ops.py index 2a65cf938..54a8ef70c 100644 --- a/test/infiniop/test_all_unary_ops.py +++ b/test/infiniop/test_all_unary_ops.py @@ -185,6 +185,28 @@ def generate_input(shape, dtype, device): } +class SinTest(UnaryTestBase): + OP_NAME = "Sin" + OP_NAME_LOWER = "sin" + + @staticmethod + def torch_op(x): + return torch.sin(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate test tensors with values in range [-200, -100) for sin operation + # sin domain is (-∞, +∞), so we use range [-200, -100) + return torch.rand(shape, dtype=dtype, device=device) * 100 - 200 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, + } + + EQUAL_NAN = True + + class CosTest(UnaryTestBase): OP_NAME = "Cos" OP_NAME_LOWER = "cos" @@ -288,6 +310,77 @@ def generate_input(shape, dtype, device): EQUAL_NAN = True +class Log2Test(UnaryTestBase): + OP_NAME = "Log2" + OP_NAME_LOWER = "log2" + + @staticmethod + def torch_op(x): + return torch.log2(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log2 domain is (0, +∞), so we use range [0.1, 1.1) + return torch.rand(shape, dtype=dtype, device=device) + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + EQUAL_NAN = True + + +class Log10Test(UnaryTestBase): + OP_NAME = "Log10" + OP_NAME_LOWER = "log10" + + @staticmethod + def torch_op(x): + return torch.log10(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log10 domain is (0, +∞), so we use range [0.1, 1.1) + return torch.rand(shape, dtype=dtype, device=device) + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + 
EQUAL_NAN = True + + +class Log1pTest(UnaryTestBase): + OP_NAME = "Log1p" + OP_NAME_LOWER = "log1p" + + @staticmethod + def torch_op(x): + return torch.log1p(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log1p domain is (-1, +∞), so we use range [-0.9, 1.1) + # Include values close to zero to test numerical stability + x = torch.rand(shape, dtype=dtype, device=device) * 2 - 0.9 + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + class NegTest(UnaryTestBase): OP_NAME = "Neg" OP_NAME_LOWER = "neg" @@ -410,6 +503,47 @@ def generate_input(shape, dtype, device): EQUAL_NAN = True +class SquareTest(UnaryTestBase): + OP_NAME = "Square" + OP_NAME_LOWER = "square" + + @staticmethod + def torch_op(x): + return torch.square(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class RsqrtTest(UnaryTestBase): + OP_NAME = "Rsqrt" + OP_NAME_LOWER = "rsqrt" + + @staticmethod + def torch_op(x): + return torch.rsqrt(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # rsqrt domain is (0, +∞), avoid zero + return torch.rand(shape, dtype=dtype, device=device) * 100 + 1e-6 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 2e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + class TanTest(UnaryTestBase): OP_NAME = "Tan" OP_NAME_LOWER = "tan" @@ -452,6 +586,29 @@ def generate_input(shape, dtype, device): TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] +class Exp2Test(UnaryTestBase): + OP_NAME = "Exp2" + OP_NAME_LOWER = "exp2" + + @staticmethod + def torch_op(x): + return torch.exp2(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Keep input in reasonable range to avoid overflow + return torch.rand(shape, dtype=dtype, device=device) * 4 - 2 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + class HardswishTest(UnaryTestBase): OP_NAME = "Hardswish" OP_NAME_LOWER = "hardswish" @@ -474,6 +631,117 @@ def generate_input(shape, dtype, device): TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] +class IsNanTest(UnaryTestBase): + OP_NAME = "IsNan" + OP_NAME_LOWER = "isnan" + + @staticmethod + def torch_op(x): + return torch.isnan(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate a mix of normal values and NaN values + x = torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + # Set some values to NaN + nan_mask = torch.rand(shape, device=device) < 0.3 + x[nan_mask] = float('nan') + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + } + + EQUAL_NAN = False # For isnan, we want exact match (0 or 1) + + +class IsInfTest(UnaryTestBase): + OP_NAME = "IsInf" + OP_NAME_LOWER = "isinf" + + @staticmethod + def torch_op(x): + return torch.isinf(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate a mix of normal values and Inf values + x = torch.rand(shape, dtype=dtype, 
device=device) * 10 - 5 + # Set some values to Inf + inf_mask = torch.rand(shape, device=device) < 0.3 + x[inf_mask] = float('inf') + # Set some to -Inf + neg_inf_mask = torch.rand(shape, device=device) < 0.15 + x[neg_inf_mask] = float('-inf') + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + } + + EQUAL_NAN = False # For isinf, we want exact match (0 or 1) + + +class IsFiniteTest(UnaryTestBase): + OP_NAME = "IsFinite" + OP_NAME_LOWER = "isfinite" + + @staticmethod + def torch_op(x): + return torch.isfinite(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate a mix of normal values, NaN, and Inf values + x = torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + # Set some values to NaN + nan_mask = torch.rand(shape, device=device) < 0.2 + x[nan_mask] = float('nan') + # Set some values to Inf + inf_mask = torch.rand(shape, device=device) < 0.2 + x[inf_mask] = float('inf') + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + } + + EQUAL_NAN = False # For isfinite, we want exact match (0 or 1) + + +class SincTest(UnaryTestBase): + OP_NAME = "Sinc" + OP_NAME_LOWER = "sinc" + + @staticmethod + def torch_op(x): + # torch.sinc computes the normalized sinc sin(pi*x)/(pi*x); this reference is the + # unnormalized sinc(x) = sin(x) / x with sinc(0) = 1, so compute it manually + result = torch.sin(x) / x + result[x == 0] = 1.0 + return result.to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate values around zero and some larger values + # Include zero to test the special case + x = torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + # Set some values to exactly zero + zero_mask = torch.rand(shape, device=device) < 0.1 + x[zero_mask] = 0.0 + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, # sinc can have larger errors near zero + } + + EQUAL_NAN = True + + + # ============================================================================== # 算子注册表 # ============================================================================== @@ -493,15 +761,26 @@ def generate_input(shape, dtype, device): "erf": ErfTest, "floor": FloorTest, "log": LogTest, + "log2": Log2Test, + "log10": Log10Test, + "log1p": Log1pTest, "neg": NegTest, "reciprocal": ReciprocalTest, "round": RoundTest, "sign": SignTest, + "sin": SinTest, "sinh": SinhTest, "sqrt": SqrtTest, + "square": SquareTest, + "rsqrt": RsqrtTest, "tan": TanTest, "exp": ExpTest, + "exp2": Exp2Test, "hardswish": HardswishTest, + "isnan": IsNanTest, + "isinf": IsInfTest, + "isfinite": IsFiniteTest, + "sinc": SincTest, }
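For reference, every operator added to these registries follows the same shape: a test class whose OP_NAME matches the C symbols bound in op_register.py, a PyTorch reference in torch_op, input generators, a TOLERANCE_MAP, and an entry in the registry dict. Below is a minimal sketch of wiring up one more binary operator; the operator name "Foo" and its torch.add reference are placeholders, and the class is assumed to sit inside test_all_binary_ops.py next to the existing tests so that torch, TestTensor, InfiniDtype and BinaryTestBase are already in scope.

class FooTest(BinaryTestBase):
    # Must match the symbols registered in op_register.py:
    # infiniopCreateFooDescriptor / infiniopGetFooWorkspaceSize / infiniopFoo / infiniopDestroyFooDescriptor.
    OP_NAME = "Foo"
    OP_NAME_LOWER = "foo"

    @staticmethod
    def torch_op(c, a, b):
        # PyTorch reference result, written into the output tensor c.
        torch.add(a, b, out=c)  # placeholder reference op

    @staticmethod
    def generate_input_a(shape, a_stride, dtype, device):
        return TestTensor(shape, a_stride, dtype, device)

    @staticmethod
    def generate_input_b(shape, b_stride, dtype, device):
        # Shift the range away from zero if the operator cannot accept b == 0,
        # mirroring DivTest / ModTest above.
        return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1)

    TOLERANCE_MAP = {
        InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
        InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    }

    EQUAL_NAN = True


# Register the test so the shared binary-op driver picks it up.
BINARY_OP_TESTS["foo"] = FooTest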