From 120d746848322092e0084c2c424b1706bae6d489 Mon Sep 17 00:00:00 2001
From: gongchensu
Date: Mon, 5 Jan 2026 15:28:13 +0000
Subject: [PATCH 1/7] Issue/887 - Add pow, div, mod, min, max operators with CPU and NVIDIA implementations.

---
 include/infiniop.h | 3 +
 include/infiniop/ops/div.h | 26 +++
 include/infiniop/ops/max.h | 26 +++
 include/infiniop/ops/min.h | 26 +++
 include/infiniop/ops/mod.h | 26 +++
 include/infiniop/ops/pow.h | 26 +++
 src/infiniop/ops/div/cpu/div_cpu.cc | 50 +++++
 src/infiniop/ops/div/cpu/div_cpu.h | 19 ++
 src/infiniop/ops/div/cuda/kernel.cuh | 23 +++
 src/infiniop/ops/div/nvidia/div_nvidia.cu | 57 ++++++
 src/infiniop/ops/div/nvidia/div_nvidia.cuh | 8 +
 src/infiniop/ops/div/operator.cc | 202 +++++++++++++++++++++
 src/infiniop/ops/max/cpu/max_cpu.cc | 50 +++++
 src/infiniop/ops/max/cpu/max_cpu.h | 20 ++
 src/infiniop/ops/max/cuda/kernel.cuh | 23 +++
 src/infiniop/ops/max/nvidia/max_nvidia.cu | 57 ++++++
 src/infiniop/ops/max/nvidia/max_nvidia.cuh | 8 +
 src/infiniop/ops/max/operator.cc | 202 +++++++++++++++++++++
 src/infiniop/ops/min/cpu/min_cpu.cc | 50 +++++
 src/infiniop/ops/min/cpu/min_cpu.h | 20 ++
 src/infiniop/ops/min/cuda/kernel.cuh | 23 +++
 src/infiniop/ops/min/nvidia/min_nvidia.cu | 57 ++++++
 src/infiniop/ops/min/nvidia/min_nvidia.cuh | 8 +
 src/infiniop/ops/min/operator.cc | 202 +++++++++++++++++++++
 src/infiniop/ops/mod/cpu/mod_cpu.cc | 49 +++++
 src/infiniop/ops/mod/cpu/mod_cpu.h | 23 +++
 src/infiniop/ops/mod/cuda/kernel.cuh | 30 +++
 src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 57 ++++++
 src/infiniop/ops/mod/nvidia/mod_nvidia.cuh | 8 +
 src/infiniop/ops/mod/operator.cc | 142 +++++++++++++++
 src/infiniop/ops/pow/cpu/pow_cpu.cc | 49 +++++
 src/infiniop/ops/pow/cpu/pow_cpu.h | 19 ++
 src/infiniop/ops/pow/cuda/kernel.cuh | 40 ++++
 src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 57 ++++++
 src/infiniop/ops/pow/nvidia/pow_nvidia.cuh | 8 +
 src/infiniop/ops/pow/operator.cc | 142 +++++++++++++++
 test/infiniop/div.py | 192 ++++++++++++++++++++
 test/infiniop/libinfiniop/op_register.py | 170 +++++++++++++++++
 test/infiniop/max.py | 189 +++++++++++++++++++
 test/infiniop/min.py | 189 +++++++++++++++++++
 test/infiniop/mod.py | 190 +++++++++++++++++++
 test/infiniop/pow.py | 190 +++++++++++++++++++
 42 files changed, 2956 insertions(+)
 create mode 100644 include/infiniop/ops/div.h
 create mode 100644 include/infiniop/ops/max.h
 create mode 100644 include/infiniop/ops/min.h
 create mode 100644 include/infiniop/ops/mod.h
 create mode 100644 include/infiniop/ops/pow.h
 create mode 100644 src/infiniop/ops/div/cpu/div_cpu.cc
 create mode 100644 src/infiniop/ops/div/cpu/div_cpu.h
 create mode 100644 src/infiniop/ops/div/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cu
 create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cuh
 create mode 100644 src/infiniop/ops/div/operator.cc
 create mode 100644 src/infiniop/ops/max/cpu/max_cpu.cc
 create mode 100644 src/infiniop/ops/max/cpu/max_cpu.h
 create mode 100644 src/infiniop/ops/max/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cu
 create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cuh
 create mode 100644 src/infiniop/ops/max/operator.cc
 create mode 100644 src/infiniop/ops/min/cpu/min_cpu.cc
 create mode 100644 src/infiniop/ops/min/cpu/min_cpu.h
 create mode 100644 src/infiniop/ops/min/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cu
 create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cuh
 create mode 100644 src/infiniop/ops/min/operator.cc
 create mode 100644
src/infiniop/ops/mod/cpu/mod_cpu.cc create mode 100644 src/infiniop/ops/mod/cpu/mod_cpu.h create mode 100644 src/infiniop/ops/mod/cuda/kernel.cuh create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cu create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cuh create mode 100644 src/infiniop/ops/mod/operator.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.h create mode 100644 src/infiniop/ops/pow/cuda/kernel.cuh create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cu create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cuh create mode 100644 src/infiniop/ops/pow/operator.cc create mode 100644 test/infiniop/div.py create mode 100644 test/infiniop/max.py create mode 100644 test/infiniop/min.py create mode 100644 test/infiniop/mod.py create mode 100644 test/infiniop/pow.py diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..cf1688868 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -9,11 +9,14 @@ #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/div.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/max.h" +#include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..e539b440c --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h new file mode 100644 index 000000000..e6f2f5d4c --- /dev/null +++ b/include/infiniop/ops/max.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MAX_API_H__ +#define __INFINIOP_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h new file mode 100644 index 000000000..f72f0c4db --- /dev/null +++ b/include/infiniop/ops/min.h @@ -0,0 
+1,26 @@ +#ifndef __INFINIOP_MIN_API_H__ +#define __INFINIOP_MIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h new file mode 100644 index 000000000..5a6cd5bbf --- /dev/null +++ b/include/infiniop/ops/mod.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MOD_API_H__ +#define __INFINIOP_MOD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopModDescriptor_t; + +__C __export infiniStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h new file mode 100644 index 000000000..6449d8622 --- /dev/null +++ b/include/infiniop/ops/pow.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_POW_API_H__ +#define __INFINIOP_POW_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; + +__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..19e222031 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,50 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create 
CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<DivOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<DivOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::div::cpu
diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h
new file mode 100644
index 000000000..0373b766f
--- /dev/null
+++ b/src/infiniop/ops/div/cpu/div_cpu.h
@@ -0,0 +1,19 @@
+#ifndef __DIV_CPU_H__
+#define __DIV_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(div, cpu)
+
+namespace op::div::cpu {
+typedef struct DivOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    T operator()(const T &a, const T &b) const {
+        return a / b;
+    }
+} DivOp;
+} // namespace op::div::cpu
+
+#endif // __DIV_CPU_H__
diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh
new file mode 100644
index 000000000..a67993da5
--- /dev/null
+++ b/src/infiniop/ops/div/cuda/kernel.cuh
@@ -0,0 +1,23 @@
+#ifndef __DIV_CUDA_H__
+#define __DIV_CUDA_H__
+
+namespace op::div::cuda {
+typedef struct DivOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __h2div(a, b);
+        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+            return a / b;
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fdividef(a, b);
+        } else {
+            return a / b;
+        }
+    }
+} DivOp;
+} // namespace op::div::cuda
+
+#endif // __DIV_CUDA_H__
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu
new file mode 100644
index 000000000..1abffe816
--- /dev/null
+++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu
@@ -0,0 +1,57 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "div_nvidia.cuh"
+
+namespace op::div::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256,
cuda::DivOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..84021a1af --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/div_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/div_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/div_moore.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void 
*b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc new file mode 100644 index 000000000..1b30fa4e4 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -0,0 +1,50 @@ +#include "max_cpu.h" + +namespace op::max::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::cpu diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h new file mode 100644 index 000000000..4d085ed39 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MAX_CPU_H__ +#define __MAX_CPU_H__ + +#include 
"../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(max, cpu) + +namespace op::max::cpu { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::max(a, b); + } +} MaxOp; +} // namespace op::max::cpu + +#endif // __MAX_CPU_H__ diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh new file mode 100644 index 000000000..bf3977a31 --- /dev/null +++ b/src/infiniop/ops/max/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MAX_CUDA_H__ +#define __MAX_CUDA_H__ + +namespace op::max::cuda { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __hmax2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a > b ? a : b; + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else { + return a > b ? a : b; + } + } +} MaxOp; +} // namespace op::max::cuda + +#endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu new file mode 100644 index 000000000..5e9fb13f4 --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "max_nvidia.cuh" + +namespace op::max::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cuh b/src/infiniop/ops/max/nvidia/max_nvidia.cuh new file mode 100644 index 000000000..b3b60dd2a --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_CUDA_API_H__ +#define __MAX_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(max, nvidia) + +#endif // __MAX_CUDA_API_H__ diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc new file mode 100644 index 000000000..e04368533 --- /dev/null +++ b/src/infiniop/ops/max/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" 
+#include "../../handle.h" +#include "infiniop/ops/max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/max_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/max_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/max_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/max_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaxDescriptor( + infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMax( + infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc new file mode 100644 index 000000000..dc30ee57f --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -0,0 +1,50 @@ +#include "min_cpu.h" + +namespace op::min::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h new file mode 100644 index 000000000..1c84d4fca --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MIN_CPU_H__ +#define __MIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(min, cpu) + +namespace op::min::cpu { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::min(a, b); + } +} MinOp; +} // namespace op::min::cpu + +#endif // __MIN_CPU_H__ diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh new file mode 100644 index 000000000..aac14a0e8 --- /dev/null +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MIN_CUDA_H__ +#define __MIN_CUDA_H__ + +namespace op::min::cuda { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + 
return __hmin2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a < b ? a : b; + } else if constexpr (std::is_same_v) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } +} MinOp; +} // namespace op::min::cuda + +#endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu new file mode 100644 index 000000000..419655e29 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "min_nvidia.cuh" + +namespace op::min::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cuh b/src/infiniop/ops/min/nvidia/min_nvidia.cuh new file mode 100644 index 000000000..ada9a3545 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MIN_CUDA_API_H__ +#define __MIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(min, nvidia) + +#endif // __MIN_CUDA_API_H__ diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc new file mode 100644 index 000000000..8479feab4 --- /dev/null +++ b/src/infiniop/ops/min/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/min.h" + +#ifdef ENABLE_CPU_API +#include "cpu/min_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/min_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/min_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/min_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/min_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/min_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMinDescriptor( + infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::min::NAMESPACE::Descriptor::create( \ + 
handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMin( + infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + 
DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc new file mode 100644 index 000000000..907d05166 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -0,0 +1,49 @@ +#include "mod_cpu.h" + +namespace op::mod::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h new file mode 100644 index 000000000..9e78adca6 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -0,0 +1,23 @@ +#ifndef __MOD_CPU_H__ +#define __MOD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(mod, cpu) + +namespace op::mod::cpu { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cpu + +#endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh new file mode 100644 index 000000000..0dcb54136 --- /dev/null +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __MOD_CUDA_H__ +#define __MOD_CUDA_H__ + +#include +#include + +namespace op::mod::cuda { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(std::fmod(a_, b_)); + } else if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cuda + +#endif // __MOD_CUDA_H__ diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu new file mode 100644 index 000000000..64326d441 --- /dev/null +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu @@ -0,0 +1,57 @@ +#include 
"../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "mod_nvidia.cuh" + +namespace op::mod::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh new file mode 100644 index 000000000..31788cfd2 --- /dev/null +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MOD_CUDA_API_H__ +#define __MOD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(mod, nvidia) + +#endif // __MOD_CUDA_API_H__ diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc new file mode 100644 index 000000000..85810e794 --- /dev/null +++ b/src/infiniop/ops/mod/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/mod.h" + +#ifdef ENABLE_CPU_API +#include "cpu/mod_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/mod_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateModDescriptor( + infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::mod::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + 
GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMod( + infiniopModDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc new file mode 100644 index 000000000..0c6fda0f7 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -0,0 +1,49 @@ +#include "pow_cpu.h" + +namespace op::pow::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h new file mode 100644 index 000000000..21d9bb897 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -0,0 +1,19 @@ +#ifndef __POW_CPU_H__ +#define __POW_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + 
+ELEMENTWISE_DESCRIPTOR(pow, cpu) + +namespace op::pow::cpu { +typedef struct PowOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::pow(a, b); + } +} PowOp; +} // namespace op::pow::cpu + +#endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh new file mode 100644 index 000000000..e8b5324a0 --- /dev/null +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -0,0 +1,40 @@ +#ifndef __POW_CUDA_H__ +#define __POW_CUDA_H__ + +#include +#include +#include + +namespace op::pow::cuda { +typedef struct PowOp { + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float ans_f = __powf(a_, b_); + return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(__powf(a_, b_)); + } else if constexpr (std::is_same_v) { + return __powf(a, b); + } else { + return std::pow(a, b); + } + } +} PowOp; + +} // namespace op::pow::cuda + +#endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu new file mode 100644 index 000000000..3cfd0cd2f --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "pow_nvidia.cuh" + +namespace op::pow::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::nvidia diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh 
b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh new file mode 100644 index 000000000..5bbb2fb8c --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __POW_CUDA_API_H__ +#define __POW_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(pow, nvidia) + +#endif // __POW_CUDA_API_H__ diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc new file mode 100644 index 000000000..e90639f67 --- /dev/null +++ b/src/infiniop/ops/pow/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/pow.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pow_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/pow_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreatePowDescriptor( + infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pow::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopPow( + infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + 
DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..17b22b2e5 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,192 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + # For division, ensure b doesn't contain zeros to avoid division by zero + # Similar to old test: b = torch.rand(...) 
* 2, which gives range [0, 2) + # Use scale=2 to ensure values are in [0, 2) range, then add small bias to avoid zero + b = TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 618be2b05..a61cea018 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -269,6 +269,176 @@ def mul_(lib): ] +@OpRegister.operator +def pow_(lib): + lib.infiniopCreatePowDescriptor.restype = c_int32 + lib.infiniopCreatePowDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPowWorkspaceSize.restype = c_int32 + lib.infiniopGetPowWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPow.restype = c_int32 + lib.infiniopPow.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPowDescriptor.restype = c_int32 + lib.infiniopDestroyPowDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): 
+ lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def mod_(lib): + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetModWorkspaceSize.restype = c_int32 + lib.infiniopGetModWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_(lib): + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def min_(lib): + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMinWorkspaceSize.restype = c_int32 + lib.infiniopGetMinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def random_sample_(lib): lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 diff --git a/test/infiniop/max.py b/test/infiniop/max.py new file mode 100644 index 000000000..e4221cf3e --- /dev/null +++ b/test/infiniop/max.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + 
InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def max(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.maximum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Max on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + 
LIBINFINIOP.infiniopGetMaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_max(): + check_error( + LIBINFINIOP.infiniopMax( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_max() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_max(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/min.py b/test/infiniop/min.py new file mode 100644 index 000000000..19f52a334 --- /dev/null +++ b/test/infiniop/min.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable 
as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def min(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.minimum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Min on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMinDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_min(): + check_error( + LIBINFINIOP.infiniopMin( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_min() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_min(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/mod.py b/test/infiniop/mod.py new file mode 100644 index 000000000..298f3137f --- /dev/null +++ b/test/infiniop/mod.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: mod operation uses fmod for floating point, which should be exact +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def mod_op(c, a, b): + torch.fmod(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + # Generate test tensors with values in a reasonable range for mod operation + # Use scale=10 to get values in [0, 10) range, similar to old test + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0) + # Ensure b doesn't contain zeros to avoid division by zero in mod + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Mod on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateModDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetModWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_mod(): + check_error( + 
LIBINFINIOP.infiniopMod( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_mod() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_mod(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyModDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pow.py b/test/infiniop/pow.py new file mode 100644 index 000000000..f437c4229 --- /dev/null +++ b/test/infiniop/pow.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +# Note: Only F16 and F32 are supported, matching the old repository's binary operator +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: pow operation may have larger numerical errors, especially for F16 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def pow_op(c, a, b): + torch.pow(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + 
# Generate test tensors with values in a reasonable range for pow operation + # Avoid negative bases and very large exponents to prevent numerical issues + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Pow on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePowDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPowWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_pow(): + check_error( + LIBINFINIOP.infiniopPow( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_pow() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + # Use equal_nan=True to handle NaN cases in pow operation + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_pow(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyPowDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 8ab767e0c7aface0a3b2e9f0dd73fce209ee1474 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Wed, 7 Jan 2026 02:34:11 +0000 Subject: [PATCH 2/7] Issue/887 - Add abs,acos,acosh,asin,asinh,atan,atanh,ceil,cos,cosh,erf,floor,log,neg,reciprocal,round,sign,sinh,sqrt,tan operator with CPU and NVIDIA implementations. 
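For reference, here is a minimal usage sketch of one of the new unary operators through the C API added in this series (abs shown; the other operators follow the same flow). The handle, tensor descriptors (y_desc, x_desc), device buffers (y, x), stream, and the CHECK/device_malloc helpers are placeholders for whatever the caller already has, not names taken from the library:

    infiniopAbsDescriptor_t abs_desc;
    CHECK(infiniopCreateAbsDescriptor(handle, &abs_desc, y_desc, x_desc)); /* y = |x|, same shape as x */

    size_t workspace_size = 0;
    CHECK(infiniopGetAbsWorkspaceSize(abs_desc, &workspace_size));
    void *workspace = device_malloc(workspace_size); /* hypothetical allocator for the target device */

    /* The Python tests above pass NULL for the stream argument. */
    CHECK(infiniopAbs(abs_desc, workspace, workspace_size, y, x, stream));

    CHECK(infiniopDestroyAbsDescriptor(abs_desc));

The binary operators added in the previous patch (pow, div, mod, min, max) use the same create / get-workspace / compute / destroy pattern, but take two input descriptors and two input pointers (a, b), as exercised by the tests above.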
--- include/infiniop.h | 20 + include/infiniop/ops/abs.h | 24 + include/infiniop/ops/acos.h | 24 + include/infiniop/ops/acosh.h | 24 + include/infiniop/ops/asin.h | 24 + include/infiniop/ops/asinh.h | 24 + include/infiniop/ops/atan.h | 24 + include/infiniop/ops/atanh.h | 24 + include/infiniop/ops/ceil.h | 24 + include/infiniop/ops/cos.h | 24 + include/infiniop/ops/cosh.h | 24 + include/infiniop/ops/erf.h | 24 + include/infiniop/ops/floor.h | 24 + include/infiniop/ops/log.h | 24 + include/infiniop/ops/neg.h | 24 + include/infiniop/ops/reciprocal.h | 24 + include/infiniop/ops/round.h | 24 + include/infiniop/ops/sign.h | 24 + include/infiniop/ops/sinh.h | 24 + include/infiniop/ops/sqrt.h | 24 + include/infiniop/ops/tan.h | 24 + src/infiniop/ops/abs/cpu/abs_cpu.cc | 48 ++ src/infiniop/ops/abs/cpu/abs_cpu.h | 26 + src/infiniop/ops/abs/cuda/kernel.cuh | 26 + src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 54 ++ src/infiniop/ops/abs/nvidia/abs_nvidia.cuh | 8 + src/infiniop/ops/abs/operator.cc | 139 +++++ src/infiniop/ops/acos/cpu/acos_cpu.cc | 48 ++ src/infiniop/ops/acos/cpu/acos_cpu.h | 22 + src/infiniop/ops/acos/cuda/kernel.cuh | 32 + src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 54 ++ src/infiniop/ops/acos/nvidia/acos_nvidia.cuh | 8 + src/infiniop/ops/acos/operator.cc | 139 +++++ src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 48 ++ src/infiniop/ops/acosh/cpu/acosh_cpu.h | 22 + src/infiniop/ops/acosh/cuda/kernel.cuh | 32 + src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 54 ++ .../ops/acosh/nvidia/acosh_nvidia.cuh | 8 + src/infiniop/ops/acosh/operator.cc | 139 +++++ src/infiniop/ops/asin/cpu/asin_cpu.cc | 48 ++ src/infiniop/ops/asin/cpu/asin_cpu.h | 22 + src/infiniop/ops/asin/cuda/kernel.cuh | 32 + src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 54 ++ src/infiniop/ops/asin/nvidia/asin_nvidia.cuh | 8 + src/infiniop/ops/asin/operator.cc | 139 +++++ src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 48 ++ src/infiniop/ops/asinh/cpu/asinh_cpu.h | 22 + src/infiniop/ops/asinh/cuda/kernel.cuh | 32 + src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 54 ++ .../ops/asinh/nvidia/asinh_nvidia.cuh | 8 + src/infiniop/ops/asinh/operator.cc | 139 +++++ src/infiniop/ops/atan/cpu/atan_cpu.cc | 48 ++ src/infiniop/ops/atan/cpu/atan_cpu.h | 22 + src/infiniop/ops/atan/cuda/kernel.cuh | 32 + src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 54 ++ src/infiniop/ops/atan/nvidia/atan_nvidia.cuh | 8 + src/infiniop/ops/atan/operator.cc | 139 +++++ src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 48 ++ src/infiniop/ops/atanh/cpu/atanh_cpu.h | 22 + src/infiniop/ops/atanh/cuda/kernel.cuh | 32 + src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 54 ++ .../ops/atanh/nvidia/atanh_nvidia.cuh | 8 + src/infiniop/ops/atanh/operator.cc | 139 +++++ src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 48 ++ src/infiniop/ops/ceil/cpu/ceil_cpu.h | 26 + src/infiniop/ops/ceil/cuda/kernel.cuh | 34 + src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 54 ++ src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh | 8 + src/infiniop/ops/ceil/operator.cc | 139 +++++ src/infiniop/ops/cos/cpu/cos_cpu.cc | 48 ++ src/infiniop/ops/cos/cpu/cos_cpu.h | 22 + src/infiniop/ops/cos/cuda/kernel.cuh | 32 + src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 54 ++ src/infiniop/ops/cos/nvidia/cos_nvidia.cuh | 8 + src/infiniop/ops/cos/operator.cc | 139 +++++ src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 48 ++ src/infiniop/ops/cosh/cpu/cosh_cpu.h | 22 + src/infiniop/ops/cosh/cuda/kernel.cuh | 32 + src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 54 ++ src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh | 8 + src/infiniop/ops/cosh/operator.cc | 139 
+++++ src/infiniop/ops/erf/cpu/erf_cpu.cc | 48 ++ src/infiniop/ops/erf/cpu/erf_cpu.h | 22 + src/infiniop/ops/erf/cuda/kernel.cuh | 32 + src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 54 ++ src/infiniop/ops/erf/nvidia/erf_nvidia.cuh | 8 + src/infiniop/ops/erf/operator.cc | 139 +++++ src/infiniop/ops/floor/cpu/floor_cpu.cc | 48 ++ src/infiniop/ops/floor/cpu/floor_cpu.h | 26 + src/infiniop/ops/floor/cuda/kernel.cuh | 34 + src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 54 ++ .../ops/floor/nvidia/floor_nvidia.cuh | 8 + src/infiniop/ops/floor/operator.cc | 139 +++++ src/infiniop/ops/log/cpu/log_cpu.cc | 48 ++ src/infiniop/ops/log/cpu/log_cpu.h | 22 + src/infiniop/ops/log/cuda/kernel.cuh | 32 + src/infiniop/ops/log/nvidia/log_nvidia.cu | 54 ++ src/infiniop/ops/log/nvidia/log_nvidia.cuh | 8 + src/infiniop/ops/log/operator.cc | 139 +++++ src/infiniop/ops/neg/cpu/neg_cpu.cc | 48 ++ src/infiniop/ops/neg/cpu/neg_cpu.h | 20 + src/infiniop/ops/neg/cuda/kernel.cuh | 23 + src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 54 ++ src/infiniop/ops/neg/nvidia/neg_nvidia.cuh | 8 + src/infiniop/ops/neg/operator.cc | 139 +++++ src/infiniop/ops/pow/cuda/kernel.cuh | 2 +- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 48 ++ .../ops/reciprocal/cpu/reciprocal_cpu.h | 20 + src/infiniop/ops/reciprocal/cuda/kernel.cuh | 32 + .../reciprocal/nvidia/reciprocal_nvidia.cu | 54 ++ .../reciprocal/nvidia/reciprocal_nvidia.cuh | 8 + src/infiniop/ops/reciprocal/operator.cc | 139 +++++ src/infiniop/ops/round/cpu/round_cpu.cc | 48 ++ src/infiniop/ops/round/cpu/round_cpu.h | 25 + src/infiniop/ops/round/cuda/kernel.cuh | 34 + src/infiniop/ops/round/nvidia/round_nvidia.cu | 54 ++ .../ops/round/nvidia/round_nvidia.cuh | 8 + src/infiniop/ops/round/operator.cc | 139 +++++ src/infiniop/ops/sign/cpu/sign_cpu.cc | 48 ++ src/infiniop/ops/sign/cpu/sign_cpu.h | 20 + src/infiniop/ops/sign/cuda/kernel.cuh | 25 + src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 54 ++ src/infiniop/ops/sign/nvidia/sign_nvidia.cuh | 8 + src/infiniop/ops/sign/operator.cc | 139 +++++ src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 48 ++ src/infiniop/ops/sinh/cpu/sinh_cpu.h | 22 + src/infiniop/ops/sinh/cuda/kernel.cuh | 32 + src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 54 ++ src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh | 8 + src/infiniop/ops/sinh/operator.cc | 139 +++++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 48 ++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 22 + src/infiniop/ops/sqrt/cuda/kernel.cuh | 32 + src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 54 ++ src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh | 8 + src/infiniop/ops/sqrt/operator.cc | 139 +++++ src/infiniop/ops/tan/cpu/tan_cpu.cc | 48 ++ src/infiniop/ops/tan/cpu/tan_cpu.h | 22 + src/infiniop/ops/tan/cuda/kernel.cuh | 55 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 54 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cuh | 8 + src/infiniop/ops/tan/operator.cc | 139 +++++ test/infiniop/abs.py | 164 +++++ test/infiniop/acos.py | 165 +++++ test/infiniop/acosh.py | 165 +++++ test/infiniop/asin.py | 165 +++++ test/infiniop/asinh.py | 165 +++++ test/infiniop/atan.py | 164 +++++ test/infiniop/atanh.py | 165 +++++ test/infiniop/ceil.py | 165 +++++ test/infiniop/cos.py | 166 +++++ test/infiniop/cosh.py | 165 +++++ test/infiniop/erf.py | 165 +++++ test/infiniop/floor.py | 165 +++++ test/infiniop/libinfiniop/op_register.py | 583 ++++++++++++++++++ test/infiniop/log.py | 166 +++++ test/infiniop/neg.py | 165 +++++ test/infiniop/reciprocal.py | 168 +++++ test/infiniop/round.py | 165 +++++ test/infiniop/sign.py | 166 +++++ test/infiniop/sinh.py | 166 +++++ 
test/infiniop/sqrt.py | 166 +++++ test/infiniop/tan.py | 167 +++++ 163 files changed, 10468 insertions(+), 1 deletion(-) create mode 100644 include/infiniop/ops/abs.h create mode 100644 include/infiniop/ops/acos.h create mode 100644 include/infiniop/ops/acosh.h create mode 100644 include/infiniop/ops/asin.h create mode 100644 include/infiniop/ops/asinh.h create mode 100644 include/infiniop/ops/atan.h create mode 100644 include/infiniop/ops/atanh.h create mode 100644 include/infiniop/ops/ceil.h create mode 100644 include/infiniop/ops/cos.h create mode 100644 include/infiniop/ops/cosh.h create mode 100644 include/infiniop/ops/erf.h create mode 100644 include/infiniop/ops/floor.h create mode 100644 include/infiniop/ops/log.h create mode 100644 include/infiniop/ops/neg.h create mode 100644 include/infiniop/ops/reciprocal.h create mode 100644 include/infiniop/ops/round.h create mode 100644 include/infiniop/ops/sign.h create mode 100644 include/infiniop/ops/sinh.h create mode 100644 include/infiniop/ops/sqrt.h create mode 100644 include/infiniop/ops/tan.h create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.cc create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.h create mode 100644 src/infiniop/ops/abs/cuda/kernel.cuh create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cu create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cuh create mode 100644 src/infiniop/ops/abs/operator.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.h create mode 100644 src/infiniop/ops/acos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cu create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cuh create mode 100644 src/infiniop/ops/acos/operator.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.h create mode 100644 src/infiniop/ops/acosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh create mode 100644 src/infiniop/ops/acosh/operator.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.h create mode 100644 src/infiniop/ops/asin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cu create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cuh create mode 100644 src/infiniop/ops/asin/operator.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.h create mode 100644 src/infiniop/ops/asinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh create mode 100644 src/infiniop/ops/asinh/operator.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.h create mode 100644 src/infiniop/ops/atan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cu create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cuh create mode 100644 src/infiniop/ops/atan/operator.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.h create mode 100644 src/infiniop/ops/atanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh create mode 100644 src/infiniop/ops/atanh/operator.cc create mode 100644 
src/infiniop/ops/ceil/cpu/ceil_cpu.cc create mode 100644 src/infiniop/ops/ceil/cpu/ceil_cpu.h create mode 100644 src/infiniop/ops/ceil/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh create mode 100644 src/infiniop/ops/ceil/operator.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.h create mode 100644 src/infiniop/ops/cos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cu create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cuh create mode 100644 src/infiniop/ops/cos/operator.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.h create mode 100644 src/infiniop/ops/cosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh create mode 100644 src/infiniop/ops/cosh/operator.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.h create mode 100644 src/infiniop/ops/erf/cuda/kernel.cuh create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cu create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cuh create mode 100644 src/infiniop/ops/erf/operator.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.h create mode 100644 src/infiniop/ops/floor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cu create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cuh create mode 100644 src/infiniop/ops/floor/operator.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.h create mode 100644 src/infiniop/ops/log/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cu create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cuh create mode 100644 src/infiniop/ops/log/operator.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.h create mode 100644 src/infiniop/ops/neg/cuda/kernel.cuh create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cu create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cuh create mode 100644 src/infiniop/ops/neg/operator.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h create mode 100644 src/infiniop/ops/reciprocal/cuda/kernel.cuh create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh create mode 100644 src/infiniop/ops/reciprocal/operator.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.h create mode 100644 src/infiniop/ops/round/cuda/kernel.cuh create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cu create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cuh create mode 100644 src/infiniop/ops/round/operator.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.h create mode 100644 src/infiniop/ops/sign/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cu create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cuh create mode 100644 src/infiniop/ops/sign/operator.cc create mode 100644 
src/infiniop/ops/sinh/cpu/sinh_cpu.cc create mode 100644 src/infiniop/ops/sinh/cpu/sinh_cpu.h create mode 100644 src/infiniop/ops/sinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh create mode 100644 src/infiniop/ops/sinh/operator.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.h create mode 100644 src/infiniop/ops/sqrt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh create mode 100644 src/infiniop/ops/sqrt/operator.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.h create mode 100644 src/infiniop/ops/tan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cuh create mode 100644 src/infiniop/ops/tan/operator.cc create mode 100644 test/infiniop/abs.py create mode 100644 test/infiniop/acos.py create mode 100644 test/infiniop/acosh.py create mode 100644 test/infiniop/asin.py create mode 100644 test/infiniop/asinh.py create mode 100644 test/infiniop/atan.py create mode 100644 test/infiniop/atanh.py create mode 100644 test/infiniop/ceil.py create mode 100644 test/infiniop/cos.py create mode 100644 test/infiniop/cosh.py create mode 100644 test/infiniop/erf.py create mode 100644 test/infiniop/floor.py create mode 100644 test/infiniop/log.py create mode 100644 test/infiniop/neg.py create mode 100644 test/infiniop/reciprocal.py create mode 100644 test/infiniop/round.py create mode 100644 test/infiniop/sign.py create mode 100644 test/infiniop/sinh.py create mode 100644 test/infiniop/sqrt.py create mode 100644 test/infiniop/tan.py diff --git a/include/infiniop.h b/include/infiniop.h index cf1688868..4778fce90 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,9 +2,21 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" +#include "infiniop/ops/abs.h" +#include "infiniop/ops/acos.h" +#include "infiniop/ops/acosh.h" #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" +#include "infiniop/ops/asin.h" +#include "infiniop/ops/asinh.h" +#include "infiniop/ops/atan.h" +#include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/ceil.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/cosh.h" +#include "infiniop/ops/erf.h" +#include "infiniop/ops/floor.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" @@ -13,17 +25,24 @@ #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/log.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/max.h" #include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" +#include "infiniop/ops/neg.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/random_sample.h" +#include "infiniop/ops/reciprocal.h" #include "infiniop/ops/rearrange.h" +#include "infiniop/ops/round.h" +#include "infiniop/ops/sign.h" +#include "infiniop/ops/sinh.h" +#include "infiniop/ops/sqrt.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" @@ -33,6 +52,7 @@ #include "infiniop/ops/softplus.h" 
#include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h new file mode 100644 index 000000000..7b5872657 --- /dev/null +++ b/include/infiniop/ops/abs.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ABS_API_H__ +#define __INFINIOP_ABS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h new file mode 100644 index 000000000..fe6af01ed --- /dev/null +++ b/include/infiniop/ops/acos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOS_API_H__ +#define __INFINIOP_ACOS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h new file mode 100644 index 000000000..be28918bb --- /dev/null +++ b/include/infiniop/ops/acosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOSH_API_H__ +#define __INFINIOP_ACOSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h new file mode 100644 index 000000000..2aac6d1e1 --- /dev/null +++ b/include/infiniop/ops/asin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASIN_API_H__ +#define __INFINIOP_ASIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t 
infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h new file mode 100644 index 000000000..d1385fc01 --- /dev/null +++ b/include/infiniop/ops/asinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASINH_API_H__ +#define __INFINIOP_ASINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h new file mode 100644 index 000000000..3b1a5bde3 --- /dev/null +++ b/include/infiniop/ops/atan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATAN_API_H__ +#define __INFINIOP_ATAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanDescriptor(infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h new file mode 100644 index 000000000..800afd5d5 --- /dev/null +++ b/include/infiniop/ops/atanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATANH_API_H__ +#define __INFINIOP_ATANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h new file mode 100644 index 000000000..4539d77fd --- /dev/null +++ b/include/infiniop/ops/ceil.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CEIL_API_H__ +#define __INFINIOP_CEIL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, + 
infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..8f0b6eeb7 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h new file mode 100644 index 000000000..3328151ad --- /dev/null +++ b/include/infiniop/ops/cosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COSH_API_H__ +#define __INFINIOP_COSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h new file mode 100644 index 000000000..8cbb8fb74 --- /dev/null +++ b/include/infiniop/ops/erf.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERF_API_H__ +#define __INFINIOP_ERF_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; + +__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h new file mode 100644 index 000000000..2f65f8f4a --- /dev/null +++ b/include/infiniop/ops/floor.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_FLOOR_API_H__ +#define __INFINIOP_FLOOR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; + +__C 
__export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h new file mode 100644 index 000000000..f5bec4382 --- /dev/null +++ b/include/infiniop/ops/log.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG_API_H__ +#define __INFINIOP_LOG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h new file mode 100644 index 000000000..4d3b06e21 --- /dev/null +++ b/include/infiniop/ops/neg.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_NEG_API_H__ +#define __INFINIOP_NEG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; + +__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h new file mode 100644 index 000000000..73836fea4 --- /dev/null +++ b/include/infiniop/ops/reciprocal.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_RECIPROCAL_API_H__ +#define __INFINIOP_RECIPROCAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h new file mode 100644 index 000000000..18c7fe44e --- /dev/null +++ b/include/infiniop/ops/round.h @@ -0,0 +1,24 @@ 
+#ifndef __INFINIOP_ROUND_API_H__ +#define __INFINIOP_ROUND_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; + +__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h new file mode 100644 index 000000000..fe47c7190 --- /dev/null +++ b/include/infiniop/ops/sign.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIGN_API_H__ +#define __INFINIOP_SIGN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, + infiniopSignDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h new file mode 100644 index 000000000..a5325fb81 --- /dev/null +++ b/include/infiniop/ops/sinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SINH_API_H__ +#define __INFINIOP_SINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h new file mode 100644 index 000000000..db04ec8bc --- /dev/null +++ b/include/infiniop/ops/sqrt.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SQRT_API_H__ +#define __INFINIOP_SQRT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h 
new file mode 100644 index 000000000..69fc47bf1 --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc new file mode 100644 index 000000000..7d6e81d04 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -0,0 +1,48 @@ +#include "abs_cpu.h" + +namespace op::abs::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::cpu diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h new file mode 100644 index 000000000..5b9773298 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -0,0 +1,26 @@ +#ifndef __ABS_CPU_H__ +#define __ABS_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(abs, cpu) + +namespace op::abs::cpu { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cpu + +#endif // __ABS_CPU_H__ diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh new file mode 100644 index 000000000..d7ff2db12 --- /dev/null +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __ABS_CUDA_H__ +#define __ABS_CUDA_H__ + +#include +#include + +namespace op::abs::cuda { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __habs2(x); + } else if constexpr (std::is_same_v) { + return __habs(x); + } else if constexpr (std::is_floating_point_v) { + return 
std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cuda + +#endif // __ABS_CUDA_H__ diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu new file mode 100644 index 000000000..485f0406a --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "abs_nvidia.cuh" + +namespace op::abs::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::nvidia diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh new file mode 100644 index 000000000..db1751e26 --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ABS_NVIDIA_API_H__ +#define __ABS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(abs, nvidia) + +#endif // __ABS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc new file mode 100644 index 000000000..b6820079d --- /dev/null +++ b/src/infiniop/ops/abs/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/abs.h" + +#ifdef ENABLE_CPU_API +#include "cpu/abs_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/abs_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAbsDescriptor( + infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::abs::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAbs( + infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc new file mode 100644 index 000000000..1accb6752 --- /dev/null +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -0,0 +1,48 @@ +#include "acos_cpu.h" + +namespace op::acos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::cpu diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h new file mode 100644 index 000000000..14e74b75c --- /dev/null +++ 
b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOS_CPU_H__ +#define __ACOS_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acos, cpu) + +namespace op::acos::cpu { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::acos(x); + } +} AcosOp; +} // namespace op::acos::cpu + +#endif // __ACOS_CPU_H__ diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh new file mode 100644 index 000000000..c3281c7e3 --- /dev/null +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOS_CUDA_H__ +#define __ACOS_CUDA_H__ + +#include +#include + +namespace op::acos::cuda { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acosf(x0), acosf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acosf(x); + } else { + return std::acos(x); + } + } +} AcosOp; +} // namespace op::acos::cuda + +#endif // __ACOS_CUDA_H__ diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu new file mode 100644 index 000000000..8480219bc --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acos_nvidia.cuh" + +namespace op::acos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::nvidia diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh new file mode 100644 index 000000000..a7ac7e190 --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh @@ -0,0 +1,8 @@ 
+#ifndef __ACOS_NVIDIA_API_H__ +#define __ACOS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acos, nvidia) + +#endif // __ACOS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc new file mode 100644 index 000000000..e775a005a --- /dev/null +++ b/src/infiniop/ops/acos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcosDescriptor( + infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcos( + infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc new 
file mode 100644 index 000000000..005463679 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -0,0 +1,48 @@ +#include "acosh_cpu.h" + +namespace op::acosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::cpu diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h new file mode 100644 index 000000000..b4b710ed5 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOSH_CPU_H__ +#define __ACOSH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acosh, cpu) + +namespace op::acosh::cpu { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::acosh(x); + } +} AcoshOp; +} // namespace op::acosh::cpu + +#endif // __ACOSH_CPU_H__ diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh new file mode 100644 index 000000000..fe444b1b4 --- /dev/null +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOSH_CUDA_H__ +#define __ACOSH_CUDA_H__ + +#include +#include + +namespace op::acosh::cuda { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acoshf(x); + } else { + return std::acosh(x); + } + } +} AcoshOp; +} // namespace op::acosh::cuda + +#endif // __ACOSH_CUDA_H__ diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu new file mode 100644 index 000000000..fc06590a7 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acosh_nvidia.cuh" + +namespace op::acosh::nvidia { + +Descriptor::~Descriptor() = default; 
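
To make the calling convention of these unary operators concrete, here is a host-side sketch of the full descriptor lifecycle, using the cos operator declared in the headers earlier in this patch (infiniopCreateCosDescriptor, infiniopGetCosWorkspaceSize, infiniopCos, infiniopDestroyCosDescriptor). It is only an illustration, not part of the patch: the handle and tensor descriptors are assumed to come from elsewhere (their creation APIs are not touched here), the handle is assumed to target the CPU backend so a plain malloc is acceptable for the workspace, and error handling is reduced to early returns.

#include <cstdlib>
#include "infiniop.h"

// Hypothetical helper: runs y = cos(x) once, given a handle and tensor
// descriptors created elsewhere. Assumes a CPU handle, so the workspace can
// live in host memory; device backends would need device-side allocation.
infiniStatus_t run_cos_once(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t y_desc,
                            infiniopTensorDescriptor_t x_desc,
                            void *y, const void *x, void *stream) {
    infiniopCosDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreateCosDescriptor(handle, &desc, y_desc, x_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetCosWorkspaceSize(desc, &workspace_size);
    if (status != INFINI_STATUS_SUCCESS) {
        infiniopDestroyCosDescriptor(desc);
        return status;
    }

    void *workspace = workspace_size ? std::malloc(workspace_size) : nullptr;
    status = infiniopCos(desc, workspace, workspace_size, y, x, stream);

    std::free(workspace);
    infiniopDestroyCosDescriptor(desc);
    return status;
}

The same create / query-workspace / compute / destroy sequence applies to every operator added in this patch; only the operator name in the function and descriptor types changes.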
+ +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::nvidia diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh new file mode 100644 index 000000000..b13332431 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ACOSH_NVIDIA_API_H__ +#define __ACOSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acosh, nvidia) + +#endif // __ACOSH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc new file mode 100644 index 000000000..9bba3389a --- /dev/null +++ b/src/infiniop/ops/acosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcoshDescriptor( + infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcosh( + infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc new file mode 100644 index 000000000..e149044f1 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -0,0 +1,48 @@ +#include "asin_cpu.h" + +namespace op::asin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::cpu diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h new file mode 100644 index 000000000..22bcba337 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASIN_CPU_H__ +#define __ASIN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asin, cpu) + +namespace op::asin::cpu { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::asin(x); + } +} AsinOp; +} // namespace op::asin::cpu + +#endif // __ASIN_CPU_H__ diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh 
b/src/infiniop/ops/asin/cuda/kernel.cuh new file mode 100644 index 000000000..3e8d11a07 --- /dev/null +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASIN_CUDA_H__ +#define __ASIN_CUDA_H__ + +#include +#include + +namespace op::asin::cuda { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinf(x0), asinf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinf(x); + } else { + return std::asin(x); + } + } +} AsinOp; +} // namespace op::asin::cuda + +#endif // __ASIN_CUDA_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu new file mode 100644 index 000000000..714d2b1b3 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asin_nvidia.cuh" + +namespace op::asin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::nvidia diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh new file mode 100644 index 000000000..46e168ede --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASIN_NVIDIA_API_H__ +#define __ASIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asin, nvidia) + +#endif // __ASIN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc new file mode 100644 index 000000000..c4973e9f5 --- /dev/null +++ b/src/infiniop/ops/asin/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asin.h" + +#ifdef 
ENABLE_CPU_API +#include "cpu/asin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asin_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinDescriptor( + infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsin( + infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc new file mode 100644 index 000000000..e0d5b749a --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -0,0 +1,48 @@ +#include "asinh_cpu.h" + +namespace op::asinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + 
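
The CPU backends above all reduce to a small functor (AbsOp, AcosOp, AcoshOp, AsinOp, and so on) that exposes a num_inputs constant and a templated operator(); the iteration over tensor elements is supplied by the shared elementwise framework in elementwise_cpu.h, which is not part of this patch. The standalone sketch below, built around a hypothetical functor and a plain contiguous buffer, only illustrates the contract those functors satisfy. The real framework presumably also deals with non-contiguous layouts, and dtype dispatch happens in Descriptor::calculate as shown in the patch.

#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical unary functor in the same shape as the ones in this patch.
struct SqrtLikeOp {
    static constexpr size_t num_inputs = 1;
    template <typename T>
    T operator()(const T &x) const {
        return std::sqrt(x);
    }
};

// Minimal stand-in for the elementwise loop: applies a unary functor over a
// contiguous buffer of n elements.
template <typename Op, typename T>
void apply_elementwise(const T *x, T *y, size_t n) {
    static_assert(Op::num_inputs == 1, "sketch only covers unary ops");
    Op op;
    for (size_t i = 0; i < n; ++i) {
        y[i] = op(x[i]);
    }
}

int main() {
    std::vector<float> x = {1.0f, 4.0f, 9.0f};
    std::vector<float> y(x.size());
    apply_elementwise<SqrtLikeOp>(x.data(), y.data(), x.size()); // y = {1, 2, 3}
    return 0;
}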
const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::cpu diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h new file mode 100644 index 000000000..0a999b63b --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASINH_CPU_H__ +#define __ASINH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asinh, cpu) + +namespace op::asinh::cpu { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::asinh(x); + } +} AsinhOp; +} // namespace op::asinh::cpu + +#endif // __ASINH_CPU_H__ diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh new file mode 100644 index 000000000..7cb018c8a --- /dev/null +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASINH_CUDA_H__ +#define __ASINH_CUDA_H__ + +#include +#include + +namespace op::asinh::cuda { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinhf(x); + } else { + return std::asinh(x); + } + } +} AsinhOp; +} // namespace op::asinh::cuda + +#endif // __ASINH_CUDA_H__ diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu new file mode 100644 index 000000000..203008b81 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asinh_nvidia.cuh" + +namespace op::asinh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise 
descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::nvidia diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh new file mode 100644 index 000000000..d1dcb4287 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASINH_NVIDIA_API_H__ +#define __ASINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asinh, nvidia) + +#endif // __ASINH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc new file mode 100644 index 000000000..d9ff5beda --- /dev/null +++ b/src/infiniop/ops/asinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinhDescriptor( + infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsinh( + infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); 
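
One practical note on the inverse trigonometric and hyperbolic operators collected in this patch (acos, asin, acosh, asinh, atan, atanh): several of them have restricted domains, and the functors forward directly to the underlying math routines, so out-of-domain inputs produce NaN rather than an error status. acos and asin expect |x| <= 1, acosh expects x >= 1, and atanh expects |x| < 1; asinh and atan are defined everywhere. The short check below, written against the standard C++ math functions the CPU functors call, is only meant to make that behaviour concrete, for example when choosing test inputs.

#include <cmath>
#include <cstdio>

int main() {
    // In-domain inputs give finite results...
    std::printf("acosh(1.5) = %f\n", std::acosh(1.5f));
    std::printf("atanh(0.5) = %f\n", std::atanh(0.5f));
    // ...while out-of-domain inputs silently become NaN.
    std::printf("acosh(0.5) is NaN: %d\n", std::isnan(std::acosh(0.5f)));
    std::printf("acos(2.0)  is NaN: %d\n", std::isnan(std::acos(2.0f)));
    return 0;
}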
+#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc new file mode 100644 index 000000000..a8c613d1e --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -0,0 +1,48 @@ +#include "atan_cpu.h" + +namespace op::atan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::cpu diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h new file mode 100644 index 000000000..ac2a1bc0c --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATAN_CPU_H__ +#define __ATAN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atan, cpu) + +namespace op::atan::cpu { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::atan(x); + } +} AtanOp; +} // namespace op::atan::cpu + +#endif // __ATAN_CPU_H__ diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh new file mode 100644 index 000000000..0c7745196 --- /dev/null +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATAN_CUDA_H__ +#define __ATAN_CUDA_H__ + +#include +#include + +namespace op::atan::cuda { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanf(__half2float(__low2half(x))), 
atanf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanf(x0), atanf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanf(x); + } else { + return std::atan(x); + } + } +} AtanOp; +} // namespace op::atan::cuda + +#endif // __ATAN_CUDA_H__ diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu new file mode 100644 index 000000000..2c6cf53d4 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atan_nvidia.cuh" + +namespace op::atan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::nvidia diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh new file mode 100644 index 000000000..2aaee1ad9 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATAN_NVIDIA_API_H__ +#define __ATAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atan, nvidia) + +#endif // __ATAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc new file mode 100644 index 000000000..c56e101d2 --- /dev/null +++ b/src/infiniop/ops/atan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanDescriptor( + infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), 
\ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtan( + infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc new file mode 100644 index 000000000..66ef4b1df --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -0,0 +1,48 @@ +#include "atanh_cpu.h" + +namespace op::atanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case 
INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::cpu diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h new file mode 100644 index 000000000..8c2b04755 --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATANH_CPU_H__ +#define __ATANH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atanh, cpu) + +namespace op::atanh::cpu { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::atanh(x); + } +} AtanhOp; +} // namespace op::atanh::cpu + +#endif // __ATANH_CPU_H__ diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh new file mode 100644 index 000000000..5337d8243 --- /dev/null +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATANH_CUDA_H__ +#define __ATANH_CUDA_H__ + +#include +#include + +namespace op::atanh::cuda { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanhf(x); + } else { + return std::atanh(x); + } + } +} AtanhOp; +} // namespace op::atanh::cuda + +#endif // __ATANH_CUDA_H__ diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu new file mode 100644 index 000000000..cb5a1ff03 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atanh_nvidia.cuh" + +namespace op::atanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); 
+ case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::nvidia diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh new file mode 100644 index 000000000..da73cfa99 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATANH_NVIDIA_API_H__ +#define __ATANH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atanh, nvidia) + +#endif // __ATANH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc new file mode 100644 index 000000000..a73adcb23 --- /dev/null +++ b/src/infiniop/ops/atanh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atanh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanhDescriptor( + infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtanh( + infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return 
INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc new file mode 100644 index 000000000..17b3ec888 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -0,0 +1,48 @@ +#include "ceil_cpu.h" + +namespace op::ceil::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::cpu diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h new file mode 100644 index 000000000..c3ca8e441 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -0,0 +1,26 @@ +#ifndef __CEIL_CPU_H__ +#define __CEIL_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(ceil, cpu) + +namespace op::ceil::cpu { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cpu + +#endif // __CEIL_CPU_H__ diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh new file mode 100644 index 000000000..a2d2e7fb5 --- /dev/null +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __CEIL_CUDA_H__ +#define __CEIL_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::ceil::cuda { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2ceil(x); + } else if constexpr (std::is_same_v) { + return hceil(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return ceilf(x); + } else if constexpr (std::is_integral_v) { + return 
x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cuda + +#endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu new file mode 100644 index 000000000..c7ad2ee5b --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "ceil_nvidia.cuh" + +namespace op::ceil::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh new file mode 100644 index 000000000..9bada334d --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CEIL_NVIDIA_API_H__ +#define __CEIL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ceil, nvidia) + +#endif // __CEIL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc new file mode 100644 index 000000000..4e5ee7800 --- /dev/null +++ b/src/infiniop/ops/ceil/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/ceil.h" + +#ifdef ENABLE_CPU_API +#include "cpu/ceil_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ceil_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCeilDescriptor( + infiniopHandle_t handle, + infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::ceil::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCeil( + infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..9dc68d327 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,48 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..9b4236fc2 --- /dev/null +++ 
b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __COS_CPU_H__
+#define __COS_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, cpu)
+
+namespace op::cos::cpu {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::cos(x);
+    }
+} CosOp;
+} // namespace op::cos::cpu
+
+#endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
new file mode 100644
index 000000000..b0dabb340
--- /dev/null
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __COS_CUDA_H__
+#define __COS_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_fp16.h>
+
+namespace op::cos::cuda {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2cos(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hcos(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(cosf(x0), cosf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(cosf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __cosf(x);
+        } else {
+            return std::cos(x);
+        }
+    }
+} CosOp;
+} // namespace op::cos::cuda
+
+#endif // __COS_CUDA_H__
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
new file mode 100644
index 000000000..044c59ca0
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "cos_nvidia.cuh"
+
+namespace op::cos::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cos::nvidia
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
new file mode 100644
index 000000000..a9866e4d2
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __COS_NVIDIA_API_H__
+#define __COS_NVIDIA_API_H__
+
+#include
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..5c464ad60 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc new file mode 100644 index 000000000..9ed8e33da --- /dev/null +++ 
b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -0,0 +1,48 @@ +#include "cosh_cpu.h" + +namespace op::cosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h new file mode 100644 index 000000000..aea359ef2 --- /dev/null +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __COSH_CPU_H__ +#define __COSH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(cosh, cpu) + +namespace op::cosh::cpu { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::cosh(x); + } +} CoshOp; +} // namespace op::cosh::cpu + +#endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh new file mode 100644 index 000000000..ce6806433 --- /dev/null +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __COSH_CUDA_H__ +#define __COSH_CUDA_H__ + +#include +#include + +namespace op::cosh::cuda { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(coshf(x0), coshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(coshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } +} CoshOp; +} // namespace op::cosh::cuda + +#endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu new file mode 100644 index 000000000..a5e1442ce --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cosh_nvidia.cuh" + +namespace op::cosh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh new file mode 100644 index 000000000..6a032b0bb --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COSH_NVIDIA_API_H__ +#define __COSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cosh, nvidia) + +#endif // __COSH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc new file mode 100644 index 000000000..75aac0c91 --- /dev/null +++ b/src/infiniop/ops/cosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCoshDescriptor( + infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopCosh( + infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc new file mode 100644 index 000000000..00b1897d1 --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -0,0 +1,48 @@ +#include "erf_cpu.h" + +namespace op::erf::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h new file mode 100644 index 000000000..c26f519cf --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ERF_CPU_H__ +#define __ERF_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(erf, cpu) + +namespace op::erf::cpu { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::erf(x); + } +} ErfOp; +} // namespace op::erf::cpu + +#endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh new file mode 100644 index 000000000..820c10b19 --- /dev/null +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ERF_CUDA_H__ +#define 
__ERF_CUDA_H__ + +#include +#include + +namespace op::erf::cuda { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(erff(x0), erff(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(erff(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } +} ErfOp; +} // namespace op::erf::cuda + +#endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu new file mode 100644 index 000000000..9080593de --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erf_nvidia.cuh" + +namespace op::erf::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh new file mode 100644 index 000000000..0621150fa --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERF_NVIDIA_API_H__ +#define __ERF_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erf, nvidia) + +#endif // __ERF_NVIDIA_API_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc new file mode 100644 index 000000000..1491cfa9a --- /dev/null +++ b/src/infiniop/ops/erf/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erf.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/erf_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateErfDescriptor( + 
infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopErf( + infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc b/src/infiniop/ops/floor/cpu/floor_cpu.cc new file mode 100644 index 000000000..e809a02e2 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -0,0 +1,48 @@ +#include "floor_cpu.h" + +namespace op::floor::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + 
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h new file mode 100644 index 000000000..91508a384 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -0,0 +1,26 @@ +#ifndef __FLOOR_CPU_H__ +#define __FLOOR_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(floor, cpu) + +namespace op::floor::cpu { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cpu + +#endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh new file mode 100644 index 000000000..c89ce34f4 --- /dev/null +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __FLOOR_CUDA_H__ +#define __FLOOR_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::floor::cuda { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cuda + +#endif // __FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu new file mode 100644 index 000000000..08305048a --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "floor_nvidia.cuh" + +namespace op::floor::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh new file mode 100644 index 000000000..7a3c2f5c7 --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FLOOR_NVIDIA_API_H__ +#define __FLOOR_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(floor, nvidia) + +#endif // __FLOOR_NVIDIA_API_H__ diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc new file mode 100644 index 000000000..4e4ed2b5a --- /dev/null +++ b/src/infiniop/ops/floor/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/floor.h" + +#ifdef ENABLE_CPU_API +#include "cpu/floor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/floor_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateFloorDescriptor( + infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::floor::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFloor( + infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc new file mode 100644 index 000000000..e7314c319 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -0,0 +1,48 @@ +#include "log_cpu.h" + +namespace op::log::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h new file mode 100644 index 000000000..535e681d3 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -0,0 +1,22 @@ +#ifndef __LOG_CPU_H__ +#define __LOG_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(log, cpu) + +namespace op::log::cpu { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::log(x); + } +} LogOp; +} // namespace op::log::cpu + +#endif // __LOG_CPU_H__ diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh new file mode 100644 index 000000000..b1e46873c --- /dev/null +++ b/src/infiniop/ops/log/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __LOG_CUDA_H__ +#define __LOG_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::log::cuda { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = 
__bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } +} LogOp; +} // namespace op::log::cuda + +#endif // __LOG_CUDA_H__ diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu new file mode 100644 index 000000000..9e7bcafc4 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log_nvidia.cuh" + +namespace op::log::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::nvidia diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cuh b/src/infiniop/ops/log/nvidia/log_nvidia.cuh new file mode 100644 index 000000000..c48841622 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG_NVIDIA_API_H__ +#define __LOG_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log, nvidia) + +#endif // __LOG_NVIDIA_API_H__ diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc new file mode 100644 index 000000000..8f2add408 --- /dev/null +++ b/src/infiniop/ops/log/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogDescriptor( + infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::log::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog( + infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc new file mode 100644 index 000000000..5da2ae4c3 --- /dev/null +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -0,0 +1,48 @@ +#include "neg_cpu.h" + +namespace op::neg::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace 
op::neg::cpu
diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h
new file mode 100644
index 000000000..ea45989b3
--- /dev/null
+++ b/src/infiniop/ops/neg/cpu/neg_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __NEG_CPU_H__
+#define __NEG_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(neg, cpu)
+
+namespace op::neg::cpu {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return -x;
+    }
+} NegOp;
+} // namespace op::neg::cpu
+
+#endif // __NEG_CPU_H__
diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh b/src/infiniop/ops/neg/cuda/kernel.cuh
new file mode 100644
index 000000000..57904b3df
--- /dev/null
+++ b/src/infiniop/ops/neg/cuda/kernel.cuh
@@ -0,0 +1,23 @@
+#ifndef __NEG_CUDA_H__
+#define __NEG_CUDA_H__
+
+#include <cuda_fp16.h>
+
+namespace op::neg::cuda {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __hneg2(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hneg(x);
+        } else {
+            return -x;
+        }
+    }
+} NegOp;
+} // namespace op::neg::cuda
+
+#endif // __NEG_CUDA_H__
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
new file mode 100644
index 000000000..d18b8bf25
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "neg_nvidia.cuh"
+
+namespace op::neg::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::neg::nvidia
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
new file mode 100644
index 000000000..1265cd3df
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __NEG_NVIDIA_API_H__
+#define __NEG_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(neg, nvidia)
+
+#endif // __NEG_NVIDIA_API_H__
diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc
new file mode 100644
index 000000000..d4134df3e
--- /dev/null
+++ b/src/infiniop/ops/neg/operator.cc
@@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/neg.h" + +#ifdef ENABLE_CPU_API +#include "cpu/neg_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/neg_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateNegDescriptor( + infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::neg::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopNeg( + infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index e8b5324a0..3786e7a52 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -2,8 +2,8 @@ #define __POW_CUDA_H__ #include -#include #include +#include namespace op::pow::cuda { typedef struct PowOp { diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc new file mode 100644 index 000000000..52874c8b3 --- 
/dev/null +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc @@ -0,0 +1,48 @@ +#include "reciprocal_cpu.h" + +namespace op::reciprocal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reciprocal::cpu diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h new file mode 100644 index 000000000..0a0f223f0 --- /dev/null +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h @@ -0,0 +1,20 @@ +#ifndef __RECIPROCAL_CPU_H__ +#define __RECIPROCAL_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(reciprocal, cpu) + +namespace op::reciprocal::cpu { +typedef struct ReciprocalOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return T(1) / x; + } +} ReciprocalOp; +} // namespace op::reciprocal::cpu + +#endif // __RECIPROCAL_CPU_H__ diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh new file mode 100644 index 000000000..94c71de90 --- /dev/null +++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __RECIPROCAL_CUDA_H__ +#define __RECIPROCAL_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::reciprocal::cuda { +typedef struct ReciprocalOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2rcp(x); + } else if constexpr (std::is_same_v) { + return hrcp(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frcp_rn(x); + } else { + return T(1) / x; + } + } +} ReciprocalOp; +} // namespace op::reciprocal::cuda + +#endif // __RECIPROCAL_CUDA_H__ diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu new file mode 100644 index 000000000..45b74e25e --- /dev/null +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "reciprocal_nvidia.cuh" + 
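+// reciprocal_nvidia.cu: create() accepts only F16/F32 outputs whose shape
+// matches the input and builds the shared elementwise CUDA descriptor;
+// calculate() checks the caller-provided workspace, then launches the
+// templated elementwise kernel (256-thread blocks) with cuda::ReciprocalOp,
+// which computes y = 1 / x per element via hrcp/h2rcp on half data and
+// __frcp_rn on float.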
+namespace op::reciprocal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh new file mode 100644 index 000000000..d98c8f4c2 --- /dev/null +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RECIPROCAL_NVIDIA_API_H__ +#define __RECIPROCAL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(reciprocal, nvidia) + +#endif // __RECIPROCAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc new file mode 100644 index 000000000..033286024 --- /dev/null +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reciprocal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reciprocal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/reciprocal_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateReciprocalDescriptor( + infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reciprocal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + 
GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReciprocal( + infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc new file mode 100644 index 000000000..0b0cea7b7 --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.cc @@ -0,0 +1,48 @@ +#include "round_cpu.h" + +namespace op::round::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::round::cpu diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h new file mode 100644 index 000000000..eccd6df0f --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.h @@ -0,0 +1,25 @@ +#ifndef __ROUND_CPU_H__ +#define __ROUND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(round, cpu) + +namespace op::round::cpu { +typedef struct RoundOp { +public: + static constexpr size_t 
num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } +} RoundOp; +} // namespace op::round::cpu + +#endif // __ROUND_CPU_H__ diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh new file mode 100644 index 000000000..c52a10716 --- /dev/null +++ b/src/infiniop/ops/round/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __ROUND_CUDA_H__ +#define __ROUND_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::round::cuda { +typedef struct RoundOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2rint(x); + } else if constexpr (std::is_same_v) { + return hrint(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(rintf(x0), rintf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(rintf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return rintf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } +} RoundOp; +} // namespace op::round::cuda + +#endif // __ROUND_CUDA_H__ diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu new file mode 100644 index 000000000..c1fabc885 --- /dev/null +++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "round_nvidia.cuh" + +namespace op::round::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::round::nvidia diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cuh b/src/infiniop/ops/round/nvidia/round_nvidia.cuh new file mode 100644 index 000000000..65bb38566 --- /dev/null +++ b/src/infiniop/ops/round/nvidia/round_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ROUND_NVIDIA_API_H__ +#define __ROUND_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(round, nvidia) + +#endif // 
__ROUND_NVIDIA_API_H__ diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc new file mode 100644 index 000000000..9468803c8 --- /dev/null +++ b/src/infiniop/ops/round/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/round.h" + +#ifdef ENABLE_CPU_API +#include "cpu/round_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/round_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateRoundDescriptor( + infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::round::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopRound( + infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc new file mode 100644 index 000000000..1f3430e73 --- /dev/null +++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc @@ -0,0 +1,48 @@ +#include "sign_cpu.h" + +namespace op::sign::cpu { + 
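+// sign_cpu.cc: the CPU descriptor mirrors the other unary elementwise ops.
+// create() validates that the output dtype is F16 or F32 and that the input
+// and output shapes match, then builds the CPU elementwise descriptor;
+// calculate() switches on the stored dtype and runs the elementwise loop
+// with SignOp, which returns 1 for x > 0, 0 for x == 0, and -1 otherwise.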
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sign::cpu diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h new file mode 100644 index 000000000..505194c85 --- /dev/null +++ b/src/infiniop/ops/sign/cpu/sign_cpu.h @@ -0,0 +1,20 @@ +#ifndef __SIGN_CPU_H__ +#define __SIGN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sign, cpu) + +namespace op::sign::cpu { +typedef struct SignOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } +} SignOp; +} // namespace op::sign::cpu + +#endif // __SIGN_CPU_H__ diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh new file mode 100644 index 000000000..3737282b0 --- /dev/null +++ b/src/infiniop/ops/sign/cuda/kernel.cuh @@ -0,0 +1,25 @@ +#ifndef __SIGN_CUDA_H__ +#define __SIGN_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::sign::cuda { +typedef struct SignOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); + return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); + } else if constexpr (std::is_same_v) { + return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); + } else { + return x > T(0) ? T(1) : (x == T(0) ? 
T(0) : T(-1)); + } + } +} SignOp; +} // namespace op::sign::cuda + +#endif // __SIGN_CUDA_H__ diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu new file mode 100644 index 000000000..6a3152e41 --- /dev/null +++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sign_nvidia.cuh" + +namespace op::sign::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sign::nvidia diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh new file mode 100644 index 000000000..d5f2540a3 --- /dev/null +++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGN_NVIDIA_API_H__ +#define __SIGN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sign, nvidia) + +#endif // __SIGN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc new file mode 100644 index 000000000..8f658a9b3 --- /dev/null +++ b/src/infiniop/ops/sign/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sign.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sign_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sign_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSignDescriptor( + infiniopHandle_t handle, + infiniopSignDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sign::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSign( + infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc new file mode 100644 index 000000000..40685847d --- /dev/null +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc @@ -0,0 +1,48 @@ +#include "sinh_cpu.h" + +namespace op::sinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h new file mode 100644 index 000000000..dbc8f3c7e --- /dev/null +++ 
b/src/infiniop/ops/sinh/cpu/sinh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __SINH_CPU_H__ +#define __SINH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sinh, cpu) + +namespace op::sinh::cpu { +typedef struct SinhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::sinh(x); + } +} SinhOp; +} // namespace op::sinh::cpu + +#endif // __SINH_CPU_H__ diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh new file mode 100644 index 000000000..c09150666 --- /dev/null +++ b/src/infiniop/ops/sinh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __SINH_CUDA_H__ +#define __SINH_CUDA_H__ + +#include +#include + +namespace op::sinh::cuda { +typedef struct SinhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(sinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sinhf(x0), sinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return sinhf(x); + } else { + return std::sinh(x); + } + } +} SinhOp; +} // namespace op::sinh::cuda + +#endif // __SINH_CUDA_H__ diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu new file mode 100644 index 000000000..d4c3fd165 --- /dev/null +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sinh_nvidia.cuh" + +namespace op::sinh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sinh::nvidia diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh new file mode 100644 index 000000000..66e3e3e67 --- /dev/null +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh @@ -0,0 +1,8 @@ 
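+// sinh_nvidia.cuh: like the other unary ops in this patch, the header only
+// invokes ELEMENTWISE_DESCRIPTOR(sinh, nvidia), which presumably expands to
+// the op::sinh::nvidia::Descriptor declaration consumed by operator.cc; the
+// per-element math (sinhf on unpacked half/bf16 lanes, std::sinh otherwise)
+// lives in ../cuda/kernel.cuh.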
+#ifndef __SINH_NVIDIA_API_H__ +#define __SINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sinh, nvidia) + +#endif // __SINH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc new file mode 100644 index 000000000..1636ce2c8 --- /dev/null +++ b/src/infiniop/ops/sinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSinhDescriptor( + infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSinh( + infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc new file 
mode 100644 index 000000000..99e723126 --- /dev/null +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc @@ -0,0 +1,48 @@ +#include "sqrt_cpu.h" + +namespace op::sqrt::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sqrt::cpu diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h new file mode 100644 index 000000000..3d026cf63 --- /dev/null +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h @@ -0,0 +1,22 @@ +#ifndef __SQRT_CPU_H__ +#define __SQRT_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sqrt, cpu) + +namespace op::sqrt::cpu { +typedef struct SqrtOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::sqrt(x); + } +} SqrtOp; +} // namespace op::sqrt::cpu + +#endif // __SQRT_CPU_H__ diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh new file mode 100644 index 000000000..c82cd7dd5 --- /dev/null +++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __SQRT_CUDA_H__ +#define __SQRT_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::sqrt::cuda { +typedef struct SqrtOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2sqrt(x); + } else if constexpr (std::is_same_v) { + return hsqrt(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __fsqrt_rn(x); + } else { + return std::sqrt(x); + } + } +} SqrtOp; +} // namespace op::sqrt::cuda + +#endif // __SQRT_CUDA_H__ diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu new file mode 100644 index 000000000..519d06e89 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sqrt_nvidia.cuh" + +namespace op::sqrt::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor 
**desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sqrt::nvidia diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh new file mode 100644 index 000000000..6cd98c814 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SQRT_NVIDIA_API_H__ +#define __SQRT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sqrt, nvidia) + +#endif // __SQRT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc new file mode 100644 index 000000000..b11c8a4b5 --- /dev/null +++ b/src/infiniop/ops/sqrt/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sqrt.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sqrt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sqrt_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSqrtDescriptor( + infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sqrt::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
infiniStatus_t infiniopSqrt( + infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..2947dfc5e --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,48 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..c3a22456c --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..bbd8facaa --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef 
__TAN_CUDA_H__ +#define __TAN_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +#define TAN_THRESHOLD 15000 + +namespace op::tan::cuda { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2sin(x) / h2cos(x); + } else if constexpr (std::is_same_v) { + float tan_f = __tanf(__half2float(x)); + if (std::fabs(tan_f) > TAN_THRESHOLD) { + return __float2half(tanf(__half2float(x))); + } + return __float2half(tan_f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + float tan_f0 = __tanf(x0); + float tan_f1 = __tanf(x1); + if (std::fabs(tan_f0) > TAN_THRESHOLD) { + tan_f0 = tanf(x0); + } + if (std::fabs(tan_f1) > TAN_THRESHOLD) { + tan_f1 = tanf(x1); + } + return __floats2bfloat162_rn(tan_f0, tan_f1); + } else if constexpr (std::is_same_v) { + float tan_f = __tanf(__bfloat162float(x)); + if (std::fabs(tan_f) > TAN_THRESHOLD) { + return __float2bfloat16_rn(tanf(__bfloat162float(x))); + } + return __float2bfloat16_rn(tan_f); + } else if constexpr (std::is_same_v) { + float tan_f = __tanf(x); + if (std::fabs(tan_f) > TAN_THRESHOLD) { + return tanf(x); + } + return tan_f; + } else { + return std::tan(x); + } + } +} TanOp; +} // namespace op::tan::cuda + +#endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu new file mode 100644 index 000000000..b4c24e2fe --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tan_nvidia.cuh" + +namespace op::tan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh new file mode 100644 index 000000000..ec620cbeb --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TAN_NVIDIA_API_H__ +#define __TAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tan, nvidia) + 
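+// The descriptor declared above dispatches to op::tan::cuda::TanOp in
+// ../cuda/kernel.cuh, which evaluates the fast __tanf intrinsic first and
+// falls back to the slower, more accurate tanf whenever |__tanf(x)| exceeds
+// TAN_THRESHOLD (15000), i.e. near the poles of tan where the fast path
+// loses precision; the packed half2 path instead computes h2sin(x) / h2cos(x).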
+#endif // __TAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..48ae8d48e --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/tan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/abs.py b/test/infiniop/abs.py new file mode 100644 index 000000000..df8748a97 --- /dev/null +++ b/test/infiniop/abs.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + 
LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def abs_op(x): + return torch.abs(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for abs operation + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Abs on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = abs_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAbsDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAbsWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_abs(): + check_error( + LIBINFINIOP.infiniopAbs( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_abs() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: abs_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_abs(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = 
args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acos.py b/test/infiniop/acos.py new file mode 100644 index 000000000..d39e966c4 --- /dev/null +++ b/test/infiniop/acos.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acos_op(x): + return torch.acos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for acos operation + # acos domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcosDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acos(): + check_error( + LIBINFINIOP.infiniopAcos( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acos() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if 
DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acosh.py b/test/infiniop/acosh.py new file mode 100644 index 000000000..c6777998b --- /dev/null +++ b/test/infiniop/acosh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acosh_op(x): + return torch.acosh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [1, 101) for acosh operation + # acosh domain is [1, +∞), so we use range [1, 101) + x_torch_tensor = torch.rand(shape) * 100 + 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acosh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcoshDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the 
shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcoshWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acosh(): + check_error( + LIBINFINIOP.infiniopAcosh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acosh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acosh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcoshDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asin.py b/test/infiniop/asin.py new file mode 100644 index 000000000..18cf0ec8e --- /dev/null +++ b/test/infiniop/asin.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asin_op(x): + return torch.asin(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for asin operation + # asin domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + 
device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asin_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asin(): + check_error( + LIBINFINIOP.infiniopAsin( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asin() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asin_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asinh.py b/test/infiniop/asinh.py new file mode 100644 index 000000000..d051d486e --- /dev/null +++ b/test/infiniop/asinh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types 
+_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asinh_op(x): + return torch.asinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for asinh operation + # asinh domain is (-∞, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asinh(): + check_error( + LIBINFINIOP.infiniopAsinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atan.py b/test/infiniop/atan.py new file mode 100644 index 000000000..01fceff5b --- /dev/null +++ b/test/infiniop/atan.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3, 13, 9, 17),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), +] 
+ + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def atan_op(x): + return torch.atan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for atan operation + # atan domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Atan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = atan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAtanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAtanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_atan(): + check_error( + LIBINFINIOP.infiniopAtan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_atan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: atan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_atan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAtanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py new file mode 100644 index 000000000..74073a6f2 --- /dev/null +++ b/test/infiniop/atanh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, 
+ TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def atanh_op(x): + return torch.atanh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for atanh operation + # atanh domain is (-1, 1), so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = atanh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAtanhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAtanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_atanh(): + check_error( + LIBINFINIOP.infiniopAtanh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_atanh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: atanh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = 
args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/ceil.py b/test/infiniop/ceil.py new file mode 100644 index 000000000..afc1993c1 --- /dev/null +++ b/test/infiniop/ceil.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def ceil_op(x): + return torch.ceil(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for ceil operation + # ceil domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Ceil on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = ceil_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCeilDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCeilWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_ceil(): + check_error( + LIBINFINIOP.infiniopCeil( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_ceil() + if sync is not None: + sync() + + atol, rtol = 
get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: ceil_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_ceil(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCeilDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..972f17b7b --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-4, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos_op(x): + return torch.cos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for cos operation + # cos domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( 
+ handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_cos() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cosh.py b/test/infiniop/cosh.py new file mode 100644 index 000000000..ee7994531 --- /dev/null +++ b/test/infiniop/cosh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cosh_op(x): + return torch.cosh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for cosh operation + # cosh domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = 
torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cosh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCoshDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCoshWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_cosh(): + check_error( + LIBINFINIOP.infiniopCosh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_cosh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cosh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCoshDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py new file mode 100644 index 000000000..f5f9c4cd9 --- /dev/null +++ b/test/infiniop/erf.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES 
= [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def erf_op(x): + return torch.erf(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-3, 3) for erf operation + # erf domain is (-∞, +∞), so we use range [-3, 3) + x_torch_tensor = torch.rand(shape) * 6 - 3 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = erf_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_erf(): + check_error( + LIBINFINIOP.infiniopErf( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_erf() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: erf_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_erf(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/floor.py b/test/infiniop/floor.py new file mode 100644 index 000000000..b981da809 --- /dev/null +++ b/test/infiniop/floor.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + 
((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def floor_op(x): + return torch.floor(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for floor operation + # floor domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Floor on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = floor_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateFloorDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetFloorWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_floor(): + check_error( + LIBINFINIOP.infiniopFloor( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_floor() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: floor_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_floor(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyFloorDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index a61cea018..20a9188d6 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -496,6 +496,589 @@ def rearrange_(lib): 
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t] +@OpRegister.operator +def abs_(lib): + lib.infiniopCreateAbsDescriptor.restype = c_int32 + lib.infiniopCreateAbsDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAbsWorkspaceSize.restype = c_int32 + lib.infiniopGetAbsWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAbs.restype = c_int32 + lib.infiniopAbs.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAbsDescriptor.restype = c_int32 + lib.infiniopDestroyAbsDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acos_(lib): + lib.infiniopCreateAcosDescriptor.restype = c_int32 + lib.infiniopCreateAcosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcosWorkspaceSize.restype = c_int32 + lib.infiniopGetAcosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcos.restype = c_int32 + lib.infiniopAcos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcosDescriptor.restype = c_int32 + lib.infiniopDestroyAcosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acosh_(lib): + lib.infiniopCreateAcoshDescriptor.restype = c_int32 + lib.infiniopCreateAcoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcoshWorkspaceSize.restype = c_int32 + lib.infiniopGetAcoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcosh.restype = c_int32 + lib.infiniopAcosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcoshDescriptor.restype = c_int32 + lib.infiniopDestroyAcoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asin_(lib): + lib.infiniopCreateAsinDescriptor.restype = c_int32 + lib.infiniopCreateAsinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsin.restype = c_int32 + lib.infiniopAsin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinDescriptor.restype = c_int32 + lib.infiniopDestroyAsinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asinh_(lib): + lib.infiniopCreateAsinhDescriptor.restype = c_int32 + lib.infiniopCreateAsinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinhWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsinh.restype = c_int32 + lib.infiniopAsinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, 
+ c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinhDescriptor.restype = c_int32 + lib.infiniopDestroyAsinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atan_(lib): + lib.infiniopCreateAtanDescriptor.restype = c_int32 + lib.infiniopCreateAtanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtan.restype = c_int32 + lib.infiniopAtan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanDescriptor.restype = c_int32 + lib.infiniopDestroyAtanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atanh_(lib): + lib.infiniopCreateAtanhDescriptor.restype = c_int32 + lib.infiniopCreateAtanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanhWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtanh.restype = c_int32 + lib.infiniopAtanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanhDescriptor.restype = c_int32 + lib.infiniopDestroyAtanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ceil_(lib): + lib.infiniopCreateCeilDescriptor.restype = c_int32 + lib.infiniopCreateCeilDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCeilWorkspaceSize.restype = c_int32 + lib.infiniopGetCeilWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCeil.restype = c_int32 + lib.infiniopCeil.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCeilDescriptor.restype = c_int32 + lib.infiniopDestroyCeilDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cosh_(lib): + lib.infiniopCreateCoshDescriptor.restype = c_int32 + lib.infiniopCreateCoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCoshWorkspaceSize.restype = c_int32 + lib.infiniopGetCoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + 
lib.infiniopCosh.restype = c_int32 + lib.infiniopCosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCoshDescriptor.restype = c_int32 + lib.infiniopDestroyCoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sinh_(lib): + lib.infiniopCreateSinhDescriptor.restype = c_int32 + lib.infiniopCreateSinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSinhWorkspaceSize.restype = c_int32 + lib.infiniopGetSinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSinh.restype = c_int32 + lib.infiniopSinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinhDescriptor.restype = c_int32 + lib.infiniopDestroySinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erf_(lib): + lib.infiniopCreateErfDescriptor.restype = c_int32 + lib.infiniopCreateErfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetErfWorkspaceSize.restype = c_int32 + lib.infiniopGetErfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopErf.restype = c_int32 + lib.infiniopErf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyErfDescriptor.restype = c_int32 + lib.infiniopDestroyErfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def floor_(lib): + lib.infiniopCreateFloorDescriptor.restype = c_int32 + lib.infiniopCreateFloorDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFloorWorkspaceSize.restype = c_int32 + lib.infiniopGetFloorWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFloor.restype = c_int32 + lib.infiniopFloor.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFloorDescriptor.restype = c_int32 + lib.infiniopDestroyFloorDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def neg_(lib): + lib.infiniopCreateNegDescriptor.restype = c_int32 + lib.infiniopCreateNegDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetNegWorkspaceSize.restype = c_int32 + lib.infiniopGetNegWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopNeg.restype = c_int32 + lib.infiniopNeg.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyNegDescriptor.restype = c_int32 + lib.infiniopDestroyNegDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reciprocal_(lib): + lib.infiniopCreateReciprocalDescriptor.restype = c_int32 + lib.infiniopCreateReciprocalDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetReciprocalWorkspaceSize.restype = 
c_int32 + lib.infiniopGetReciprocalWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopReciprocal.restype = c_int32 + lib.infiniopReciprocal.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReciprocalDescriptor.restype = c_int32 + lib.infiniopDestroyReciprocalDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def round_(lib): + lib.infiniopCreateRoundDescriptor.restype = c_int32 + lib.infiniopCreateRoundDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRoundWorkspaceSize.restype = c_int32 + lib.infiniopGetRoundWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopRound.restype = c_int32 + lib.infiniopRound.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRoundDescriptor.restype = c_int32 + lib.infiniopDestroyRoundDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sign_(lib): + lib.infiniopCreateSignDescriptor.restype = c_int32 + lib.infiniopCreateSignDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSignWorkspaceSize.restype = c_int32 + lib.infiniopGetSignWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSign.restype = c_int32 + lib.infiniopSign.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySignDescriptor.restype = c_int32 + lib.infiniopDestroySignDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sqrt_(lib): + lib.infiniopCreateSqrtDescriptor.restype = c_int32 + lib.infiniopCreateSqrtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSqrtWorkspaceSize.restype = c_int32 + lib.infiniopGetSqrtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSqrt.restype = c_int32 + lib.infiniopSqrt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySqrtDescriptor.restype = c_int32 + lib.infiniopDestroySqrtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log_(lib): + lib.infiniopCreateLogDescriptor.restype = c_int32 + lib.infiniopCreateLogDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLogWorkspaceSize.restype = c_int32 + lib.infiniopGetLogWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog.restype = c_int32 + lib.infiniopLog.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLogDescriptor.restype = c_int32 + lib.infiniopDestroyLogDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def tan_(lib): + lib.infiniopCreateTanDescriptor.restype = c_int32 + lib.infiniopCreateTanDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetTanWorkspaceSize.restype = c_int32 + lib.infiniopGetTanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTan.restype = c_int32 + lib.infiniopTan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTanDescriptor.restype = c_int32 + lib.infiniopDestroyTanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def relu_(lib): lib.infiniopCreateReluDescriptor.restype = c_int32 diff --git a/test/infiniop/log.py b/test/infiniop/log.py new file mode 100644 index 000000000..4f97de374 --- /dev/null +++ b/test/infiniop/log.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-7, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def log_op(x): + return torch.log(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0.1, 1.1) for log operation + # log domain is (0, +∞), so we use range [0.1, 1.1) + x_torch_tensor = torch.rand(shape) + 0.1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Log on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = log_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + 
tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log(): + check_error( + LIBINFINIOP.infiniopLog( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_log() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: log_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLogDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/neg.py b/test/infiniop/neg.py new file mode 100644 index 000000000..62607bce0 --- /dev/null +++ b/test/infiniop/neg.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def neg_op(x): + return torch.neg(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for neg operation + # This matches the original test case: * 100 - 200 + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, 
None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Neg on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = neg_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateNegDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetNegWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_neg(): + check_error( + LIBINFINIOP.infiniopNeg( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_neg() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: neg_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_neg(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyNegDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py new file mode 100644 index 000000000..4e816481c --- /dev/null +++ b/test/infiniop/reciprocal.py @@ -0,0 +1,168 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False 
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reciprocal_op(x): + return torch.reciprocal(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-10, 10) for reciprocal operation + # This matches the original test case: * 20 - 10 + # Note: Avoid values too close to zero to prevent division by zero issues + x_torch_tensor = torch.rand(shape) * 20 - 10 + # Ensure no zero values + x_torch_tensor = torch.where(x_torch_tensor == 0, torch.ones_like(x_torch_tensor), x_torch_tensor) + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = reciprocal_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReciprocalDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_reciprocal(): + check_error( + LIBINFINIOP.infiniopReciprocal( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_reciprocal() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reciprocal_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/round.py b/test/infiniop/round.py new file mode 100644 index 000000000..d6053f676 --- /dev/null +++ b/test/infiniop/round.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ 
+ # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def round_op(x): + return torch.round(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for round operation + # This matches the original test case: * 10 - 20 + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Round on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = round_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRoundDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRoundWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_round(): + check_error( + LIBINFINIOP.infiniopRound( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_round() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: round_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_round(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyRoundDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sign.py b/test/infiniop/sign.py new file mode 100644 index 000000000..f0eb5b5f8 --- /dev/null +++ b/test/infiniop/sign.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import 
c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sign_op(x): + return torch.sign(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sign operation + # sign domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sign on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sign_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSignDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSignWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sign(): + check_error( + LIBINFINIOP.infiniopSign( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sign() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sign_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sign(), device, NUM_PRERUN, 
NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySignDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sinh.py b/test/infiniop/sinh.py new file mode 100644 index 000000000..99bc02c58 --- /dev/null +++ b/test/infiniop/sinh.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sinh_op(x): + return torch.sinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sinh operation + # sinh domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = 
TestWorkspace(workspace_size.value, y.device) + + def lib_sinh(): + check_error( + LIBINFINIOP.infiniopSinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sqrt.py b/test/infiniop/sqrt.py new file mode 100644 index 000000000..6e1419971 --- /dev/null +++ b/test/infiniop/sqrt.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sqrt_op(x): + return torch.sqrt(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for sqrt operation + # sqrt domain is [0, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sqrt on {InfiniDeviceNames[device]} with 
shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sqrt_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSqrtDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSqrtWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sqrt(): + check_error( + LIBINFINIOP.infiniopSqrt( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sqrt() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sqrt_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sqrt(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySqrtDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tan.py b/test/infiniop/tan.py new file mode 100644 index 000000000..877f5dd58 --- /dev/null +++ b/test/infiniop/tan.py @@ -0,0 +1,167 @@ +import ctypes +import math +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-6, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-6, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + 
+ +def tan_op(x): + return torch.tan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-2π, 2π) for tan operation + # tan is defined for all reals except odd multiples of π/2, so we sample the range [-2π, 2π) + x_torch_tensor = torch.rand(shape) * 4 * math.pi - 2 * math.pi + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Tan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = tan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_tan(): + check_error( + LIBINFINIOP.infiniopTan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_tan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyTanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From c9247c809ec2143b7f81a87cb26cab21766227a1 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Thu, 15 Jan 2026 02:33:13 +0000 Subject: [PATCH 3/7] Issue/887 - Refactor binary and unary operators to reduce code duplication.
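Each operator header now expands a shared macro instead of repeating the same five declarations by hand. As a sketch of what the new headers resolve to (assuming ordinary C preprocessor token pasting; the macro itself is added in include/infiniop/ops/binary_op_api.h in this patch), BINARY_OP_API_DECLARE(div, Div) yields the same API that the hand-written div.h previously declared:

/* Illustrative expansion of BINARY_OP_API_DECLARE(div, Div); a sketch for review, not literal patch content. */
typedef struct InfiniopDescriptor *infiniopDivDescriptor_t;

__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle,
                                                        infiniopDivDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t c,
                                                        infiniopTensorDescriptor_t a,
                                                        infiniopTensorDescriptor_t b);

__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *c,
                                        const void *a,
                                        const void *b,
                                        void *stream);

__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc);

UNARY_OP_API_DECLARE(abs, Abs) is expected to produce the analogous single-input API (output y, input x) via include/infiniop/ops/unary_op_api.h.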
--- include/infiniop/ops/abs.h | 20 +- include/infiniop/ops/acos.h | 20 +- include/infiniop/ops/acosh.h | 20 +- include/infiniop/ops/asin.h | 20 +- include/infiniop/ops/asinh.h | 20 +- include/infiniop/ops/atan.h | 20 +- include/infiniop/ops/atanh.h | 20 +- include/infiniop/ops/binary_op_api.h | 50 ++ include/infiniop/ops/ceil.h | 20 +- include/infiniop/ops/cos.h | 20 +- include/infiniop/ops/cosh.h | 20 +- include/infiniop/ops/div.h | 22 +- include/infiniop/ops/erf.h | 20 +- include/infiniop/ops/floor.h | 20 +- include/infiniop/ops/log.h | 20 +- include/infiniop/ops/max.h | 22 +- include/infiniop/ops/min.h | 22 +- include/infiniop/ops/mod.h | 22 +- include/infiniop/ops/neg.h | 20 +- include/infiniop/ops/pow.h | 22 +- include/infiniop/ops/reciprocal.h | 20 +- include/infiniop/ops/round.h | 20 +- include/infiniop/ops/sign.h | 20 +- include/infiniop/ops/sinh.h | 20 +- include/infiniop/ops/sqrt.h | 20 +- include/infiniop/ops/tan.h | 20 +- include/infiniop/ops/unary_op_api.h | 48 ++ scripts/test_binary_unary.py | 143 +++++ src/infiniop/elementwise/binary.h | 261 +++++++++ .../elementwise/cpu/elementwise_cpu_impl.h | 130 +++++ .../nvidia/elementwise_nvidia_impl.cuh | 134 +++++ src/infiniop/elementwise/unary.h | 524 ++++++++++++++++++ src/infiniop/operator_impl.h | 288 ++++++++++ src/infiniop/ops/abs/cpu/abs_cpu.cc | 44 +- src/infiniop/ops/abs/cpu/abs_cpu.h | 21 +- src/infiniop/ops/abs/cuda/kernel.cuh | 20 +- src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 48 +- src/infiniop/ops/abs/operator.cc | 132 +---- src/infiniop/ops/acos/cpu/acos_cpu.cc | 44 +- src/infiniop/ops/acos/cpu/acos_cpu.h | 17 +- src/infiniop/ops/acos/cuda/kernel.cuh | 26 +- src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 48 +- src/infiniop/ops/acos/operator.cc | 132 +---- src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 44 +- src/infiniop/ops/acosh/cpu/acosh_cpu.h | 17 +- src/infiniop/ops/acosh/cuda/kernel.cuh | 26 +- src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 48 +- src/infiniop/ops/acosh/operator.cc | 132 +---- src/infiniop/ops/asin/cpu/asin_cpu.cc | 44 +- src/infiniop/ops/asin/cpu/asin_cpu.h | 17 +- src/infiniop/ops/asin/cuda/kernel.cuh | 26 +- src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 48 +- src/infiniop/ops/asin/operator.cc | 132 +---- src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 44 +- src/infiniop/ops/asinh/cpu/asinh_cpu.h | 17 +- src/infiniop/ops/asinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 48 +- src/infiniop/ops/asinh/operator.cc | 132 +---- src/infiniop/ops/atan/cpu/atan_cpu.cc | 44 +- src/infiniop/ops/atan/cpu/atan_cpu.h | 17 +- src/infiniop/ops/atan/cuda/kernel.cuh | 26 +- src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 48 +- src/infiniop/ops/atan/operator.cc | 132 +---- src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 44 +- src/infiniop/ops/atanh/cpu/atanh_cpu.h | 17 +- src/infiniop/ops/atanh/cuda/kernel.cuh | 26 +- src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 48 +- src/infiniop/ops/atanh/operator.cc | 132 +---- src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 44 +- src/infiniop/ops/ceil/cpu/ceil_cpu.h | 21 +- src/infiniop/ops/ceil/cuda/kernel.cuh | 28 +- src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 48 +- src/infiniop/ops/ceil/operator.cc | 132 +---- src/infiniop/ops/cos/cpu/cos_cpu.cc | 44 +- src/infiniop/ops/cos/cpu/cos_cpu.h | 17 +- src/infiniop/ops/cos/cuda/kernel.cuh | 26 +- src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 48 +- src/infiniop/ops/cos/operator.cc | 132 +---- src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 44 +- src/infiniop/ops/cosh/cpu/cosh_cpu.h | 17 +- src/infiniop/ops/cosh/cuda/kernel.cuh | 26 +- 
src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 48 +- src/infiniop/ops/cosh/operator.cc | 132 +---- src/infiniop/ops/div/cpu/div_cpu.cc | 46 +- src/infiniop/ops/div/cpu/div_cpu.h | 14 +- src/infiniop/ops/div/cuda/kernel.cuh | 19 +- src/infiniop/ops/div/nvidia/div_nvidia.cu | 51 +- src/infiniop/ops/div/operator.cc | 195 +------ src/infiniop/ops/erf/cpu/erf_cpu.cc | 44 +- src/infiniop/ops/erf/cpu/erf_cpu.h | 17 +- src/infiniop/ops/erf/cuda/kernel.cuh | 26 +- src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 48 +- src/infiniop/ops/erf/operator.cc | 132 +---- src/infiniop/ops/floor/cpu/floor_cpu.cc | 44 +- src/infiniop/ops/floor/cpu/floor_cpu.h | 21 +- src/infiniop/ops/floor/cuda/kernel.cuh | 28 +- src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 48 +- src/infiniop/ops/floor/operator.cc | 132 +---- src/infiniop/ops/log/cpu/log_cpu.cc | 44 +- src/infiniop/ops/log/cpu/log_cpu.h | 17 +- src/infiniop/ops/log/cuda/kernel.cuh | 26 +- src/infiniop/ops/log/nvidia/log_nvidia.cu | 48 +- src/infiniop/ops/log/operator.cc | 132 +---- src/infiniop/ops/max/cpu/max_cpu.cc | 46 +- src/infiniop/ops/max/cpu/max_cpu.h | 15 +- src/infiniop/ops/max/cuda/kernel.cuh | 19 +- src/infiniop/ops/max/nvidia/max_nvidia.cu | 51 +- src/infiniop/ops/max/operator.cc | 195 +------ src/infiniop/ops/min/cpu/min_cpu.cc | 46 +- src/infiniop/ops/min/cpu/min_cpu.h | 15 +- src/infiniop/ops/min/cuda/kernel.cuh | 19 +- src/infiniop/ops/min/nvidia/min_nvidia.cu | 51 +- src/infiniop/ops/min/operator.cc | 195 +------ src/infiniop/ops/mod/cpu/mod_cpu.cc | 45 +- src/infiniop/ops/mod/cpu/mod_cpu.h | 18 +- src/infiniop/ops/mod/cuda/kernel.cuh | 24 +- src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 51 +- src/infiniop/ops/mod/operator.cc | 135 +---- src/infiniop/ops/neg/cpu/neg_cpu.cc | 44 +- src/infiniop/ops/neg/cpu/neg_cpu.h | 15 +- src/infiniop/ops/neg/cuda/kernel.cuh | 17 +- src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 48 +- src/infiniop/ops/neg/operator.cc | 132 +---- src/infiniop/ops/pow/cpu/pow_cpu.cc | 45 +- src/infiniop/ops/pow/cpu/pow_cpu.h | 14 +- src/infiniop/ops/pow/cuda/kernel.cuh | 34 +- src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 51 +- src/infiniop/ops/pow/operator.cc | 135 +---- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 44 +- .../ops/reciprocal/cpu/reciprocal_cpu.h | 15 +- src/infiniop/ops/reciprocal/cuda/kernel.cuh | 26 +- .../reciprocal/nvidia/reciprocal_nvidia.cu | 48 +- src/infiniop/ops/reciprocal/operator.cc | 132 +---- src/infiniop/ops/round/cpu/round_cpu.cc | 44 +- src/infiniop/ops/round/cpu/round_cpu.h | 20 +- src/infiniop/ops/round/cuda/kernel.cuh | 28 +- src/infiniop/ops/round/nvidia/round_nvidia.cu | 48 +- src/infiniop/ops/round/operator.cc | 132 +---- src/infiniop/ops/sign/cpu/sign_cpu.cc | 44 +- src/infiniop/ops/sign/cpu/sign_cpu.h | 15 +- src/infiniop/ops/sign/cuda/kernel.cuh | 19 +- src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 48 +- src/infiniop/ops/sign/operator.cc | 132 +---- src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 44 +- src/infiniop/ops/sinh/cpu/sinh_cpu.h | 17 +- src/infiniop/ops/sinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 48 +- src/infiniop/ops/sinh/operator.cc | 132 +---- src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 44 +- src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 17 +- src/infiniop/ops/sqrt/cuda/kernel.cuh | 26 +- src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 48 +- src/infiniop/ops/sqrt/operator.cc | 132 +---- src/infiniop/ops/tan/cpu/tan_cpu.cc | 44 +- src/infiniop/ops/tan/cpu/tan_cpu.h | 17 +- src/infiniop/ops/tan/cuda/kernel.cuh | 49 +- src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 48 +- 
src/infiniop/ops/tan/operator.cc | 132 +---- src/infiniop/ops/tanh/cuda/kernel.cuh | 38 +- src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 53 +- 160 files changed, 1885 insertions(+), 7179 deletions(-) create mode 100644 include/infiniop/ops/binary_op_api.h create mode 100644 include/infiniop/ops/unary_op_api.h create mode 100755 scripts/test_binary_unary.py create mode 100644 src/infiniop/elementwise/binary.h create mode 100644 src/infiniop/elementwise/cpu/elementwise_cpu_impl.h create mode 100644 src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh create mode 100644 src/infiniop/elementwise/unary.h create mode 100644 src/infiniop/operator_impl.h diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h index 7b5872657..1d1f1cbd1 100644 --- a/include/infiniop/ops/abs.h +++ b/include/infiniop/ops/abs.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ABS_API_H__ #define __INFINIOP_ABS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); +UNARY_OP_API_DECLARE(abs, Abs) #endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h index fe6af01ed..c2f4de837 100644 --- a/include/infiniop/ops/acos.h +++ b/include/infiniop/ops/acos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOS_API_H__ #define __INFINIOP_ACOS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); +UNARY_OP_API_DECLARE(acos, Acos) #endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h index be28918bb..e8630b7d5 100644 --- a/include/infiniop/ops/acosh.h +++ b/include/infiniop/ops/acosh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOSH_API_H__ #define __INFINIOP_ACOSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); 
+UNARY_OP_API_DECLARE(acosh, Acosh) #endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h index 2aac6d1e1..1a8bdd7b8 100644 --- a/include/infiniop/ops/asin.h +++ b/include/infiniop/ops/asin.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASIN_API_H__ #define __INFINIOP_ASIN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); +UNARY_OP_API_DECLARE(asin, Asin) #endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h index d1385fc01..2a3aebf5a 100644 --- a/include/infiniop/ops/asinh.h +++ b/include/infiniop/ops/asinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASINH_API_H__ #define __INFINIOP_ASINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); +UNARY_OP_API_DECLARE(asinh, Asinh) #endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h index 3b1a5bde3..18eed316f 100644 --- a/include/infiniop/ops/atan.h +++ b/include/infiniop/ops/atan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATAN_API_H__ #define __INFINIOP_ATAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAtanDescriptor(infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); +UNARY_OP_API_DECLARE(atan, Atan) #endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h index 800afd5d5..e7db5b53c 100644 --- a/include/infiniop/ops/atanh.h +++ b/include/infiniop/ops/atanh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATANH_API_H__ #define __INFINIOP_ATANH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - 
-__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); +UNARY_OP_API_DECLARE(atanh, Atanh) #endif diff --git a/include/infiniop/ops/binary_op_api.h b/include/infiniop/ops/binary_op_api.h new file mode 100644 index 000000000..4ab2401b9 --- /dev/null +++ b/include/infiniop/ops/binary_op_api.h @@ -0,0 +1,50 @@ +#ifndef __INFINIOP_BINARY_OP_API_H__ +#define __INFINIOP_BINARY_OP_API_H__ + +#include "../operator_descriptor.h" + +/** + * @brief Macro to generate the C API header for a binary operator. + * + * This macro generates all the necessary declarations for a binary operator: + * - Descriptor type definition + * - Create descriptor function + * - Get workspace size function + * - Execute operator function + * - Destroy descriptor function + * + * Usage: + * BINARY_OP_API_DECLARE(div, Div) + * BINARY_OP_API_DECLARE(pow, Pow) + * + * @param OP_NAME Lowercase operator name (e.g., div, pow, mod) + * @param OP_NAME_UPPER Uppercase operator name (e.g., Div, Pow, Mod) + */ +#define BINARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER) \ + \ + typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \ + \ + __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c, \ + infiniopTensorDescriptor_t a, \ + infiniopTensorDescriptor_t b); \ + \ + __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size); \ + \ + __C __export infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream); \ + \ + __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc); + +#endif // __INFINIOP_BINARY_OP_API_H__ diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h index 4539d77fd..8fca73b2e 100644 --- a/include/infiniop/ops/ceil.h +++ b/include/infiniop/ops/ceil.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_CEIL_API_H__ #define __INFINIOP_CEIL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); +UNARY_OP_API_DECLARE(ceil, Ceil) #endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h index 8f0b6eeb7..ed33b0a0e 100644 --- a/include/infiniop/ops/cos.h +++ b/include/infiniop/ops/cos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_COS_API_H__ #define __INFINIOP_COS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct 
InfiniopDescriptor *infiniopCosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); +UNARY_OP_API_DECLARE(cos, Cos) #endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h index 3328151ad..b607b8fd1 100644 --- a/include/infiniop/ops/cosh.h +++ b/include/infiniop/ops/cosh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_COSH_API_H__ #define __INFINIOP_COSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); +UNARY_OP_API_DECLARE(cosh, Cosh) #endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h index e539b440c..6f146bf4c 100644 --- a/include/infiniop/ops/div.h +++ b/include/infiniop/ops/div.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_DIV_API_H__ #define __INFINIOP_DIV_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; - -__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); +BINARY_OP_API_DECLARE(div, Div) #endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h index 8cbb8fb74..0dcc149da 100644 --- a/include/infiniop/ops/erf.h +++ b/include/infiniop/ops/erf.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ERF_API_H__ #define __INFINIOP_ERF_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; - -__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); +UNARY_OP_API_DECLARE(erf, Erf) 
#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h index 2f65f8f4a..02efc6761 100644 --- a/include/infiniop/ops/floor.h +++ b/include/infiniop/ops/floor.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_FLOOR_API_H__ #define __INFINIOP_FLOOR_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; - -__C __export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); +UNARY_OP_API_DECLARE(floor, Floor) #endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h index f5bec4382..3892ccb6e 100644 --- a/include/infiniop/ops/log.h +++ b/include/infiniop/ops/log.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_LOG_API_H__ #define __INFINIOP_LOG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; - -__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); +UNARY_OP_API_DECLARE(log, Log) #endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h index e6f2f5d4c..4b91e5c83 100644 --- a/include/infiniop/ops/max.h +++ b/include/infiniop/ops/max.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MAX_API_H__ #define __INFINIOP_MAX_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); +BINARY_OP_API_DECLARE(max, Max) #endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h index f72f0c4db..1496806df 100644 --- a/include/infiniop/ops/min.h +++ b/include/infiniop/ops/min.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MIN_API_H__ #define __INFINIOP_MIN_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - 
-__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); +BINARY_OP_API_DECLARE(min, Min) #endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h index 5a6cd5bbf..e4fcd571e 100644 --- a/include/infiniop/ops/mod.h +++ b/include/infiniop/ops/mod.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MOD_API_H__ #define __INFINIOP_MOD_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopModDescriptor_t; - -__C __export infiniStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); +BINARY_OP_API_DECLARE(mod, Mod) #endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h index 4d3b06e21..0d18bbd5c 100644 --- a/include/infiniop/ops/neg.h +++ b/include/infiniop/ops/neg.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_NEG_API_H__ #define __INFINIOP_NEG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; - -__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); +UNARY_OP_API_DECLARE(neg, Neg) #endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h index 6449d8622..f4e263a58 100644 --- a/include/infiniop/ops/pow.h +++ b/include/infiniop/ops/pow.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_POW_API_H__ #define __INFINIOP_POW_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; - -__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); +BINARY_OP_API_DECLARE(pow, Pow) #endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h index 73836fea4..7d5626176 100644 --- a/include/infiniop/ops/reciprocal.h +++ 
b/include/infiniop/ops/reciprocal.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_RECIPROCAL_API_H__ #define __INFINIOP_RECIPROCAL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; - -__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); +UNARY_OP_API_DECLARE(reciprocal, Reciprocal) #endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h index 18c7fe44e..1bf4377ff 100644 --- a/include/infiniop/ops/round.h +++ b/include/infiniop/ops/round.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ROUND_API_H__ #define __INFINIOP_ROUND_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; - -__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, - infiniopRoundDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); +UNARY_OP_API_DECLARE(round, Round) #endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h index fe47c7190..ef7854de8 100644 --- a/include/infiniop/ops/sign.h +++ b/include/infiniop/ops/sign.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SIGN_API_H__ #define __INFINIOP_SIGN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, - infiniopSignDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); +UNARY_OP_API_DECLARE(sign, Sign) #endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h index a5325fb81..ea8511a2b 100644 --- a/include/infiniop/ops/sinh.h +++ b/include/infiniop/ops/sinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SINH_API_H__ #define __INFINIOP_SINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, - infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); - -__C 
__export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); +UNARY_OP_API_DECLARE(sinh, Sinh) #endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h index db04ec8bc..6df6fe89c 100644 --- a/include/infiniop/ops/sqrt.h +++ b/include/infiniop/ops/sqrt.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SQRT_API_H__ #define __INFINIOP_SQRT_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, - infiniopSqrtDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); +UNARY_OP_API_DECLARE(sqrt, Sqrt) #endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h index 69fc47bf1..d4a2f0bf2 100644 --- a/include/infiniop/ops/tan.h +++ b/include/infiniop/ops/tan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_TAN_API_H__ #define __INFINIOP_TAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; - -__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, - infiniopTanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); +UNARY_OP_API_DECLARE(tan, Tan) #endif diff --git a/include/infiniop/ops/unary_op_api.h b/include/infiniop/ops/unary_op_api.h new file mode 100644 index 000000000..eefe3c3a4 --- /dev/null +++ b/include/infiniop/ops/unary_op_api.h @@ -0,0 +1,48 @@ +#ifndef __INFINIOP_UNARY_OP_API_H__ +#define __INFINIOP_UNARY_OP_API_H__ + +#include "../operator_descriptor.h" + +/** + * @brief Macro to generate the C API header for a unary operator. 
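+ * + * For example, UNARY_OP_API_DECLARE(cos, Cos) expands to the infiniopCosDescriptor_t typedef and the infiniopCreateCosDescriptor, infiniopGetCosWorkspaceSize, infiniopCos, and infiniopDestroyCosDescriptor declarations that were previously written out by hand in cos.h.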
+ * + * This macro generates all the necessary declarations for a unary operator: + * - Descriptor type definition + * - Create descriptor function + * - Get workspace size function + * - Execute operator function + * - Destroy descriptor function + * + * Usage: + * UNARY_OP_API_DECLARE(abs, Abs) + * UNARY_OP_API_DECLARE(log, Log) + * + * @param OP_NAME Lowercase operator name (e.g., abs, log, sin) + * @param OP_NAME_UPPER Uppercase operator name (e.g., Abs, Log, Sin) + */ +#define UNARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER) \ + \ + typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \ + \ + __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t y, \ + infiniopTensorDescriptor_t x); \ + \ + __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size); \ + \ + __C __export infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream); \ + \ + __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc); + +#endif // __INFINIOP_UNARY_OP_API_H__ diff --git a/scripts/test_binary_unary.py b/scripts/test_binary_unary.py new file mode 100755 index 000000000..8dbbfbf53 --- /dev/null +++ b/scripts/test_binary_unary.py @@ -0,0 +1,143 @@ +import os +import subprocess +from set_env import set_env +import sys + +PROJECT_DIR = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test", "infiniop") +) +os.chdir(PROJECT_DIR) + + +def run_tests(args): + failed = [] + + # Binary operators (refactored) + binary_tests = [ + "div.py", + "pow.py", + "mod.py", + "min.py", + "max.py", + ] + + # Unary operators (refactored) + unary_tests = [ + "abs.py", + "log.py", + "cos.py", + "sqrt.py", + "neg.py", + "sign.py", + "reciprocal.py", + "round.py", + "floor.py", + "ceil.py", + "erf.py", + "cosh.py", + "sinh.py", + "tan.py", + "acos.py", + "acosh.py", + "asin.py", + "asinh.py", + "atan.py", + "atanh.py", + ] + + all_tests = binary_tests + unary_tests + + print("\033[94m" + "=" * 60 + "\033[0m") + print("\033[94mTesting Binary and Unary Operators (Refactored)\033[0m") + print("\033[94m" + "=" * 60 + "\033[0m") + print(f"\033[94mTotal tests: {len(all_tests)}\033[0m") + print(f"\033[94m - Binary operators: {len(binary_tests)}\033[0m") + print(f"\033[94m - Unary operators: {len(unary_tests)}\033[0m") + print() + + for test in all_tests: + if not os.path.exists(test): + print(f"\033[93m[SKIP] {test} - test file not found\033[0m") + continue + + print(f"\033[96m[RUN] {test}\033[0m", end=" ... 
", flush=True) + result = subprocess.run( + f"python3 {test} {args}", + text=True, + encoding="utf-8", + shell=True, + capture_output=True + ) + + if result.returncode != 0: + print(f"\033[91m[FAIL]\033[0m") + print(f"\033[91mError output:\033[0m") + print(result.stderr) + failed.append(test) + else: + print(f"\033[92m[PASS]\033[0m") + + return failed + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Test refactored binary and unary operators", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Test on CPU only (default) + python3 scripts/test_binary_unary.py --cpu + + # Test on NVIDIA GPU only + python3 scripts/test_binary_unary.py --nvidia + + # Test on both CPU and NVIDIA + python3 scripts/test_binary_unary.py --cpu --nvidia + + # Test with debug mode + python3 scripts/test_binary_unary.py --cpu --debug + + # Test with profiling + python3 scripts/test_binary_unary.py --nvidia --profile + """ + ) + + # Device selection arguments (same as test files) + parser.add_argument("--cpu", action="store_true", help="Run CPU tests") + parser.add_argument("--nvidia", action="store_true", help="Run NVIDIA GPU tests") + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + parser.add_argument("--profile", action="store_true", help="Enable profiling") + + args, unknown = parser.parse_known_args() + + # Build command line arguments to pass to test files + test_args = [] + if args.cpu: + test_args.append("--cpu") + if args.nvidia: + test_args.append("--nvidia") + if args.debug: + test_args.append("--debug") + if args.profile: + test_args.append("--profile") + + # Add any unknown arguments (for compatibility) + test_args.extend(unknown) + + set_env() + failed = run_tests(" ".join(test_args)) + + print() + print("\033[94m" + "=" * 60 + "\033[0m") + if len(failed) == 0: + print("\033[92m✓ All tests passed!\033[0m") + else: + print(f"\033[91m✗ {len(failed)} test(s) failed:\033[0m") + for test in failed: + print(f"\033[91m - {test}\033[0m") + print("\033[94m" + "=" * 60 + "\033[0m") + + exit(len(failed)) diff --git a/src/infiniop/elementwise/binary.h b/src/infiniop/elementwise/binary.h new file mode 100644 index 000000000..1823fac3f --- /dev/null +++ b/src/infiniop/elementwise/binary.h @@ -0,0 +1,261 @@ +#ifndef __INFINIOP_ELEMENTWISE_BINARY_H__ +#define __INFINIOP_ELEMENTWISE_BINARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include +#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::binary { + +/** + * @brief Represents all the currently defined binary operations. + * + * This enum is used to specify which binary operation to perform + * in the generic BinaryOp template. + */ +enum class BinaryMode { + // Arithmetic operations: + Add, + Subtract, + Multiply, + Divide, + Pow, + Mod, + Max, + Min, + // Logical operations (for future use): + // And, Or, Xor, Less, LessOrEqual, Equal, Greater, GreaterOrEqual +}; + +/** + * @brief Generic binary operation template that performs different operations + * based on the specified BinaryMode. + * + * This template allows multiple binary operators (pow, div, mod, min, max, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. 
+ * + * @tparam Mode The binary operation mode (from BinaryMode enum) + */ +template +struct BinaryOp { + static constexpr size_t num_inputs = 2; + + template + T operator()(const T &a, const T &b) const { + if constexpr (Mode == BinaryMode::Add) { + return a + b; + } else if constexpr (Mode == BinaryMode::Subtract) { + return a - b; + } else if constexpr (Mode == BinaryMode::Multiply) { + return a * b; + } else if constexpr (Mode == BinaryMode::Divide) { + return a / b; + } else if constexpr (Mode == BinaryMode::Pow) { + return std::pow(a, b); + } else if constexpr (Mode == BinaryMode::Mod) { + if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } else if constexpr (Mode == BinaryMode::Max) { + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + } else if constexpr (Mode == BinaryMode::Min) { + if constexpr (std::is_floating_point_v) { + return std::fmin(a, b); + } else { + return std::min(a, b); + } + } else { + static_assert(Mode != Mode, "Unsupported binary operation mode"); + return a; + } + } +}; + +#ifdef __CUDACC__ +/** + * @brief CUDA-specific binary operation template that performs different operations + * based on the specified BinaryMode, using CUDA-optimized functions. + * + * This template provides CUDA device functions optimized for GPU execution, + * using intrinsics like __powf, __h2div, __hmin2, __hmax2, etc. + * + * @tparam Mode The binary operation mode (from BinaryMode enum) + */ +namespace cuda { +template +struct BinaryOp { + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (Mode == BinaryMode::Add) { + if constexpr (std::is_same_v) { + return __hadd2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hadd(a, b); + } else if constexpr (std::is_same_v) { + return __fadd_rn(a, b); + } else { + return a + b; + } + } else if constexpr (Mode == BinaryMode::Subtract) { + if constexpr (std::is_same_v) { + return __hsub2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hsub(a, b); + } else if constexpr (std::is_same_v) { + return __fsub_rn(a, b); + } else { + return a - b; + } + } else if constexpr (Mode == BinaryMode::Multiply) { + if constexpr (std::is_same_v) { + return __hmul2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hmul(a, b); + } else if constexpr (std::is_same_v) { + return __fmul_rd(a, b); + } else { + return a * b; + } + } else if constexpr (Mode == BinaryMode::Divide) { + if constexpr (std::is_same_v) { + return __h2div(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a / b; + } else if constexpr (std::is_same_v) { + return __fdividef(a, b); + } else { + return a / b; + } + } else if constexpr (Mode == BinaryMode::Pow) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float ans_f = __powf(a_, b_); + return __float2half(isnan(ans_f) ? 
std::pow(a_, b_) : ans_f); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(__powf(a_, b_)); + } else if constexpr (std::is_same_v) { + return __powf(a, b); + } else { + return std::pow(a, b); + } + } else if constexpr (Mode == BinaryMode::Mod) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(std::fmod(a_, b_)); + } else if constexpr (std::is_floating_point_v) { + return std::fmod(a, b); + } else { + return a % b; + } + } else if constexpr (Mode == BinaryMode::Max) { + if constexpr (std::is_same_v) { + return __hmax2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a > b ? a : b; + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else { + return a > b ? a : b; + } + } else if constexpr (Mode == BinaryMode::Min) { + if constexpr (std::is_same_v) { + return __hmin2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a < b ? a : b; + } else if constexpr (std::is_same_v) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } else { + static_assert(Mode != Mode, "Unsupported binary operation mode"); + return a; + } + } +}; +} // namespace cuda +#endif // __CUDACC__ + +/** + * @brief Macro to define a binary elementwise descriptor for a specific operation. + * + * This macro simplifies the definition of binary operators (pow, div, mod, min, max, etc.) + * by automatically generating the Descriptor class and operation struct using the + * ELEMENTWISE_DESCRIPTOR macro and BinaryOp template. + * + * Usage: + * BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, BinaryMode::Pow) + * BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, BinaryMode::Divide) + * + * @param OP The operator name (e.g., pow, div, mod) + * @param NAMESPACE The device namespace (e.g., cpu, nvidia) + * @param MODE The BinaryMode enum value for this operation + */ +#define BINARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::NAMESPACE { \ + using Op = op::elementwise::binary::BinaryOp; \ + } + +/** + * @brief Macro to define a binary elementwise descriptor for CUDA/NVIDIA backend. + * + * This macro is similar to BINARY_ELEMENTWISE_DESCRIPTOR but uses the CUDA-specific + * BinaryOp implementation for better GPU performance. 
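+ * + * For example, BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(div, nvidia, BinaryMode::Divide) declares the op::div::nvidia Descriptor and defines op::div::cuda::Op as the CUDA-optimized divide functor that the NVIDIA kernel instantiates.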
+ * + * Usage: + * BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(pow, nvidia, BinaryMode::Pow) + * BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(div, nvidia, BinaryMode::Divide) + * + * @param OP The operator name (e.g., pow, div, mod) + * @param NAMESPACE The device namespace (e.g., nvidia) + * @param MODE The BinaryMode enum value for this operation + */ +#ifdef __CUDACC__ +#define BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::cuda { \ + using Op = op::elementwise::binary::cuda::BinaryOp; \ + } +#endif // __CUDACC__ + +} // namespace op::elementwise::binary + +#endif // __INFINIOP_ELEMENTWISE_BINARY_H__ diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h new file mode 100644 index 000000000..030f4d87e --- /dev/null +++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h @@ -0,0 +1,130 @@ +#ifndef __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ +#define __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ + +#include "../../../utils/check.h" +#include "../../../utils/result.hpp" +#include "../../devices/cpu/common_cpu.h" +#include "elementwise_cpu.h" + +/** + * @brief Generic implementation for elementwise CPU operators. + * + * This file provides a generic implementation template that can be used + * by all binary and unary operators to reduce code duplication. + * + * Usage: + * #include "elementwise_cpu_impl.h" + * namespace op::pow::cpu { + * using Op = op::elementwise::binary::BinaryOp; + * ELEMENTWISE_CPU_IMPL_BINARY(pow) + * } + * + * namespace op::sqrt::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) + * } + */ + +/** + * @brief Macro to generate binary operator implementation. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators, using the generic implementation. + * + * Usage: + * namespace op::pow::cpu { + * using Op = op::elementwise::binary::BinaryOp; + * ELEMENTWISE_CPU_IMPL_BINARY(pow) + * } + */ +#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &out_shape = out_desc->shape(); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Macro to generate unary operator implementation. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators, using the generic implementation. 
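+ * + * For example, ELEMENTWISE_CPU_IMPL_UNARY(sqrt) emits a create() that accepts only F16/F32 tensors whose input and output shapes match, plus a calculate() that dispatches to the F16 or F32 instantiation of the operator.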
+ * + * Usage: + * namespace op::sqrt::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh new file mode 100644 index 000000000..39b78884a --- /dev/null +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -0,0 +1,134 @@ +#ifndef __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ +#define __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ + +#include "../../../utils/check.h" +#include "../../../utils/result.hpp" +#include "../../devices/nvidia/nvidia_common.cuh" +#include "elementwise_nvidia.cuh" +#include +#include + +/** + * @brief Generic implementation for elementwise NVIDIA/CUDA operators. + * + * This file provides a generic implementation template that can be used + * by all binary and unary operators to reduce code duplication. + * + * Usage: + * #include "elementwise_nvidia_impl.cuh" + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + * + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ + +/** + * @brief Macro to generate binary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators, using the generic implementation. 
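+ * + * For example, ELEMENTWISE_NVIDIA_IMPL_BINARY(div) emits a calculate() that first checks that the supplied workspace is large enough and then dispatches to _device_info->calculate<256, cuda::Op, half> or <256, cuda::Op, float> depending on the tensor dtype.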
+ * + * Usage: + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &c_shape = out_desc->shape(); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Macro to generate unary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators, using the generic implementation. + * + * Usage: + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h new file mode 100644 index 000000000..9f41dedb2 --- /dev/null +++ b/src/infiniop/elementwise/unary.h @@ -0,0 +1,524 @@ +#ifndef __INFINIOP_ELEMENTWISE_UNARY_H__ +#define __INFINIOP_ELEMENTWISE_UNARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include 
+#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::unary { + +/** + * @brief Represents all the currently defined unary operations. + * + * This enum is used to specify which unary operation to perform + * in the generic UnaryOp template. + */ +enum class UnaryMode { + // Math operations: + Abs, + Exp, + Log, + Reciprocal, + Sqrt, + Neg, + Ceil, + Floor, + Round, + Sin, + Cos, + Tan, + Asin, + Acos, + Atan, + Sinh, + Cosh, + Tanh, + Asinh, + Acosh, + Atanh, + Relu, + Sigmoid, + Sign, + Erf, +}; + +/** + * @brief Generic unary operation template that performs different operations + * based on the specified UnaryMode. + * + * This template allows multiple unary operators (abs, log, sin, cos, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. + * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + return std::exp(x); + } else if constexpr (Mode == UnaryMode::Log) { + return std::log(x); + } else if constexpr (Mode == UnaryMode::Reciprocal) { + return T(1) / x; + } else if constexpr (Mode == UnaryMode::Sqrt) { + return std::sqrt(x); + } else if constexpr (Mode == UnaryMode::Neg) { + return -x; + } else if constexpr (Mode == UnaryMode::Ceil) { + return std::ceil(x); + } else if constexpr (Mode == UnaryMode::Floor) { + return std::floor(x); + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + return std::sin(x); + } else if constexpr (Mode == UnaryMode::Cos) { + return std::cos(x); + } else if constexpr (Mode == UnaryMode::Tan) { + return std::tan(x); + } else if constexpr (Mode == UnaryMode::Asin) { + return std::asin(x); + } else if constexpr (Mode == UnaryMode::Acos) { + return std::acos(x); + } else if constexpr (Mode == UnaryMode::Atan) { + return std::atan(x); + } else if constexpr (Mode == UnaryMode::Sinh) { + return std::sinh(x); + } else if constexpr (Mode == UnaryMode::Cosh) { + return std::cosh(x); + } else if constexpr (Mode == UnaryMode::Tanh) { + return std::tanh(x); + } else if constexpr (Mode == UnaryMode::Asinh) { + return std::asinh(x); + } else if constexpr (Mode == UnaryMode::Acosh) { + return std::acosh(x); + } else if constexpr (Mode == UnaryMode::Atanh) { + return std::atanh(x); + } else if constexpr (Mode == UnaryMode::Relu) { + return x > T(0) ? x : T(0); + } else if constexpr (Mode == UnaryMode::Sigmoid) { + return T(1) / (T(1) + std::exp(-x)); + } else if constexpr (Mode == UnaryMode::Sign) { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } else if constexpr (Mode == UnaryMode::Erf) { + return std::erf(x); + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; + +#ifdef __CUDACC__ +/** + * @brief CUDA-specific unary operation template that performs different operations + * based on the specified UnaryMode, using CUDA-optimized functions. + * + * This template provides CUDA device functions optimized for GPU execution, + * using intrinsics like __habs2, __logf, __sinf, etc. 
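+ * + * For example, UnaryOp<UnaryMode::Sqrt> lowers to h2sqrt for packed half2 values, hsqrt for half, and __fsqrt_rn for float, handling bfloat16 through per-lane sqrtf and falling back to std::sqrt for any other type.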
+ * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +namespace cuda { +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_same_v) { + return __habs2(x); + } else if constexpr (std::is_same_v) { + return __habs(x); + } else if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__expf(x_f2.x), __expf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__expf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + return __floats2bfloat162_rn(__expf(x_f2.x), __expf(x_f2.y)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__expf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __expf(x); + } else { + return std::exp(x); + } + } else if constexpr (Mode == UnaryMode::Log) { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } else if constexpr (Mode == UnaryMode::Reciprocal) { + if constexpr (std::is_same_v) { + return h2rcp(x); + } else if constexpr (std::is_same_v) { + return hrcp(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frcp_rn(x); + } else { + return T(1) / x; + } + } else if constexpr (Mode == UnaryMode::Sqrt) { + if constexpr (std::is_same_v) { + return h2sqrt(x); + } else if constexpr (std::is_same_v) { + return hsqrt(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sqrtf(x0), sqrtf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sqrtf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __fsqrt_rn(x); + } else { + return std::sqrt(x); + } + } else if constexpr (Mode == UnaryMode::Neg) { + if constexpr (std::is_same_v) { + return __hneg2(x); + } else if constexpr (std::is_same_v) { + return __hneg(x); + } else { + return -x; + } + } else if constexpr (Mode == UnaryMode::Ceil) { + if constexpr (std::is_same_v) { + return h2ceil(x); + } else if constexpr (std::is_same_v) { + return hceil(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return ceilf(x); 
+ } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::ceil(x); + } + } else if constexpr (Mode == UnaryMode::Floor) { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_same_v) { + return h2rint(x); + } else if constexpr (std::is_same_v) { + return hrint(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(rintf(x0), rintf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(rintf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return rintf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__sinf(x_f2.x), __sinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__sinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sinf(x0), sinf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __sinf(x); + } else { + return std::sin(x); + } + } else if constexpr (Mode == UnaryMode::Cos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__cosf(x_f2.x), __cosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__cosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(cosf(x0), cosf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(cosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __cosf(x); + } else { + return std::cos(x); + } + } else if constexpr (Mode == UnaryMode::Tan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(tanf(x_f2.x), tanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(tanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return tanf(x); + } else { + return std::tan(x); + } + } else if constexpr (Mode == UnaryMode::Asin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(asinf(x_f2.x), asinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return asinf(x); + } else { + return std::asin(x); + } + } else if constexpr (Mode == UnaryMode::Acos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return 
__float22half2_rn(make_float2(acosf(x_f2.x), acosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return acosf(x); + } else { + return std::acos(x); + } + } else if constexpr (Mode == UnaryMode::Atan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(atanf(x_f2.x), atanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return atanf(x); + } else { + return std::atan(x); + } + } else if constexpr (Mode == UnaryMode::Sinh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(sinhf(x_f2.x), sinhf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(sinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return sinhf(x); + } else { + return std::sinh(x); + } + } else if constexpr (Mode == UnaryMode::Cosh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(coshf(x_f2.x), coshf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } else if constexpr (Mode == UnaryMode::Tanh) { + if constexpr (std::is_same_v) { + return __h2tanh(x); + } else if constexpr (std::is_same_v) { + return __float2half(tanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(tanhf(f0), tanhf(f1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(tanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return tanhf(x); + } else { + return std::tanh(x); + } + } else if constexpr (Mode == UnaryMode::Asinh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinhf(x); + } else { + return std::asinh(x); + } + } else if constexpr (Mode == UnaryMode::Acosh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acoshf(x); + } else { + return std::acosh(x); + } + } else if constexpr (Mode == UnaryMode::Atanh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return 
__float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanhf(x); + } else { + return std::atanh(x); + } + } else if constexpr (Mode == UnaryMode::Relu) { + if constexpr (std::is_same_v) { + return __hmax2(x, __floats2half2_rn(0.0f, 0.0f)); + } else { + return x > T(0) ? x : T(0); + } + } else if constexpr (Mode == UnaryMode::Sigmoid) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + float2 exp_neg_x = make_float2(__expf(-x_f2.x), __expf(-x_f2.y)); + return __float22half2_rn(make_float2(1.0f / (1.0f + exp_neg_x.x), 1.0f / (1.0f + exp_neg_x.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(1.0f / (1.0f + __expf(-x_))); + } else if constexpr (std::is_same_v) { + return 1.0f / (1.0f + __expf(-x)); + } else { + return T(1) / (T(1) + std::exp(-x)); + } + } else if constexpr (Mode == UnaryMode::Sign) { + if constexpr (std::is_same_v) { + const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); + return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); + } else if constexpr (std::is_same_v) { + return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); + } else { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } + } else if constexpr (Mode == UnaryMode::Erf) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(erff(x_f2.x), erff(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; +} // namespace cuda +#endif // __CUDACC__ + +/** + * @brief Macro to define a unary elementwise descriptor for a specific operation. + * + * This macro simplifies the definition of unary operators (abs, log, sin, cos, etc.) + * by automatically generating the Descriptor class and operation struct using the + * ELEMENTWISE_DESCRIPTOR macro and UnaryOp template. + * + * Usage: + * UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, UnaryMode::Abs) + * UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, UnaryMode::Log) + * + * @param OP The operator name (e.g., abs, log, sin) + * @param NAMESPACE The device namespace (e.g., cpu, nvidia) + * @param MODE The UnaryMode enum value for this operation + */ +#define UNARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::NAMESPACE { \ + using Op = op::elementwise::unary::UnaryOp; \ + } + +} // namespace op::elementwise::unary + +#endif // __INFINIOP_ELEMENTWISE_UNARY_H__ diff --git a/src/infiniop/operator_impl.h b/src/infiniop/operator_impl.h new file mode 100644 index 000000000..3ff543f7e --- /dev/null +++ b/src/infiniop/operator_impl.h @@ -0,0 +1,288 @@ +#ifndef __INFINIOP_OPERATOR_IMPL_H__ +#define __INFINIOP_OPERATOR_IMPL_H__ + +#include "handle.h" +#include "operator.h" + +// Conditional compilation helpers +#ifdef ENABLE_CPU_API +#define IF_ENABLE_CPU_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CPU_API(...) +#endif + +#ifdef ENABLE_NVIDIA_API +#define IF_ENABLE_NVIDIA_API(...) 
__VA_ARGS__ +#else +#define IF_ENABLE_NVIDIA_API(...) +#endif + +#ifdef ENABLE_ILUVATAR_API +#define IF_ENABLE_ILUVATAR_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_ILUVATAR_API(...) +#endif + +#ifdef ENABLE_QY_API +#define IF_ENABLE_QY_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_QY_API(...) +#endif + +#ifdef ENABLE_METAX_API +#define IF_ENABLE_METAX_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_METAX_API(...) +#endif + +#ifdef ENABLE_KUNLUN_API +#define IF_ENABLE_KUNLUN_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_KUNLUN_API(...) +#endif + +#ifdef ENABLE_CAMBRICON_API +#define IF_ENABLE_CAMBRICON_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CAMBRICON_API(...) +#endif + +#ifdef ENABLE_MOORE_API +#define IF_ENABLE_MOORE_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_MOORE_API(...) +#endif + +/** + * Binary operator implementation macros + */ +#define BINARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, c_desc, a_desc, b_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc});) + +#define BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, METAX, metax, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, c_desc, a_desc, b_desc) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, c, a, b) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream);) + +#define BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, c, a, b) + +#define BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete 
reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc) { \ + switch (handle->device) { \ + BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +/** + * Unary operator implementation macros + */ +#define UNARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, y_desc, x_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc});) + +#define UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, METAX, metax, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, y_desc, x_desc) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, 
kunlun) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, y, x) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream);) + +#define UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, y, x) + +#define UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc) { \ + switch (handle->device) { \ + UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +#endif // __INFINIOP_OPERATOR_IMPL_H__ diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc index 7d6e81d04..d4b541ba7 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.cc +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -1,48 +1,8 @@ #include "abs_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::abs::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - 
infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::cpu diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h index 5b9773298..cba8274e6 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.h +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -1,26 +1,9 @@ #ifndef __ABS_CPU_H__ #define __ABS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(abs, cpu) - -namespace op::abs::cpu { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; -} // namespace op::abs::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, op::elementwise::unary::UnaryMode::Abs) #endif // __ABS_CPU_H__ diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh index d7ff2db12..406aa423f 100644 --- a/src/infiniop/ops/abs/cuda/kernel.cuh +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -1,26 +1,10 @@ #ifndef __ABS_CUDA_H__ #define __ABS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::abs::cuda { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __habs2(x); - } else if constexpr (std::is_same_v) { - return __habs(x); - } else if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::abs::cuda #endif // __ABS_CUDA_H__ diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu index 485f0406a..b9687226a 100644 --- a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "abs_nvidia.cuh" namespace op::abs::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = 
x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::nvidia diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc index b6820079d..051b8711a 100644 --- a/src/infiniop/ops/abs/operator.cc +++ b/src/infiniop/ops/abs/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/abs.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/abs_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAbsDescriptor( - infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::abs::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAbs( - infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { - 
-#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(abs, Abs) diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc index 1accb6752..9be4ca1fe 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.cc +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -1,48 +1,8 @@ #include "acos_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acos::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::cpu diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h index 14e74b75c..50900e217 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.h +++ b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOS_CPU_H__ #define __ACOS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acos, cpu) - -namespace op::acos::cpu { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acos(x); - } -} AcosOp; -} // namespace op::acos::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acos, cpu, op::elementwise::unary::UnaryMode::Acos) #endif // __ACOS_CPU_H__ diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh index c3281c7e3..b62bf1e88 100644 --- a/src/infiniop/ops/acos/cuda/kernel.cuh +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOS_CUDA_H__ #define __ACOS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acos::cuda { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acosf(__half2float(x))); - } else if 
constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acosf(x0), acosf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acosf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acosf(x); - } else { - return std::acos(x); - } - } -} AcosOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acos::cuda #endif // __ACOS_CUDA_H__ diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu index 8480219bc..e7cf1feea 100644 --- a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acos_nvidia.cuh" namespace op::acos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::nvidia diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc index e775a005a..15872b754 100644 --- a/src/infiniop/ops/acos/operator.cc +++ b/src/infiniop/ops/acos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcosDescriptor( - infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t 
infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcos( - infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acos, Acos) diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc index 005463679..0cb424c00 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -1,48 +1,8 @@ #include "acosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::cpu diff --git 
a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h index b4b710ed5..bb05baf14 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.h +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOSH_CPU_H__ #define __ACOSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acosh, cpu) - -namespace op::acosh::cpu { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acosh(x); - } -} AcoshOp; -} // namespace op::acosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acosh, cpu, op::elementwise::unary::UnaryMode::Acosh) #endif // __ACOSH_CPU_H__ diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh index fe444b1b4..9fbb54636 100644 --- a/src/infiniop/ops/acosh/cuda/kernel.cuh +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOSH_CUDA_H__ #define __ACOSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acosh::cuda { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acoshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acoshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acoshf(x); - } else { - return std::acosh(x); - } - } -} AcoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acosh::cuda #endif // __ACOSH_CUDA_H__ diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu index fc06590a7..5d065bdbc 100644 --- a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acosh_nvidia.cuh" namespace op::acosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcoshOp, 
half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::nvidia diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc index 9bba3389a..c1939a54c 100644 --- a/src/infiniop/ops/acosh/operator.cc +++ b/src/infiniop/ops/acosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcoshDescriptor( - infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcosh( - infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acosh, Acosh) diff --git 
a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc index e149044f1..de42639ff 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.cc +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -1,48 +1,8 @@ #include "asin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asin::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::cpu diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h index 22bcba337..8c6da5e20 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.h +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASIN_CPU_H__ #define __ASIN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asin, cpu) - -namespace op::asin::cpu { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asin(x); - } -} AsinOp; -} // namespace op::asin::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asin, cpu, op::elementwise::unary::UnaryMode::Asin) #endif // __ASIN_CPU_H__ diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh index 3e8d11a07..a7063f015 100644 --- a/src/infiniop/ops/asin/cuda/kernel.cuh +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASIN_CUDA_H__ #define __ASIN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asin::cuda { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinf(x0), asinf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinf(x); - } else { - return std::asin(x); - } - } -} AsinOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asin::cuda #endif // __ASIN_CUDA_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu 
b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu index 714d2b1b3..262755d50 100644 --- a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asin_nvidia.cuh" namespace op::asin::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::nvidia diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc index c4973e9f5..edb8fa867 100644 --- a/src/infiniop/ops/asin/operator.cc +++ b/src/infiniop/ops/asin/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/asin.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asin_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinDescriptor( - infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asin::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - 
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsin( - infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asin, Asin) diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc index e0d5b749a..8b18ab6f8 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -1,48 +1,8 @@ #include "asinh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::cpu diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h index 0a999b63b..4c3603752 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.h +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASINH_CPU_H__ #define __ASINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asinh, cpu) - -namespace op::asinh::cpu { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asinh(x); - } -} AsinhOp; -} 
// namespace op::asinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asinh, cpu, op::elementwise::unary::UnaryMode::Asinh) #endif // __ASINH_CPU_H__ diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh index 7cb018c8a..866ea147a 100644 --- a/src/infiniop/ops/asinh/cuda/kernel.cuh +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASINH_CUDA_H__ #define __ASINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asinh::cuda { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinhf(x); - } else { - return std::asinh(x); - } - } -} AsinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asinh::cuda #endif // __ASINH_CUDA_H__ diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu index 203008b81..37c44baf0 100644 --- a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asinh_nvidia.cuh" namespace op::asinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::nvidia diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc index d9ff5beda..7b519ec05 100644 --- a/src/infiniop/ops/asinh/operator.cc +++ b/src/infiniop/ops/asinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include 
"infiniop/ops/asinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinhDescriptor( - infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsinh( - infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asinh, Asinh) diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc index a8c613d1e..075c7fd4e 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.cc +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -1,48 +1,8 @@ #include "atan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atan::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); 
- - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::cpu diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h index ac2a1bc0c..6b333cfb1 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.h +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATAN_CPU_H__ #define __ATAN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atan, cpu) - -namespace op::atan::cpu { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atan(x); - } -} AtanOp; -} // namespace op::atan::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atan, cpu, op::elementwise::unary::UnaryMode::Atan) #endif // __ATAN_CPU_H__ diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh index 0c7745196..ce553c1c1 100644 --- a/src/infiniop/ops/atan/cuda/kernel.cuh +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATAN_CUDA_H__ #define __ATAN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atan::cuda { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanf(__half2float(__low2half(x))), atanf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanf(x0), atanf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanf(x); - } else { - return std::atan(x); - } - } -} AtanOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atan::cuda #endif // __ATAN_CUDA_H__ diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu index 2c6cf53d4..a05d65b79 100644 --- a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atan_nvidia.cuh" namespace op::atan::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector 
input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::nvidia diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc index c56e101d2..9025489c3 100644 --- a/src/infiniop/ops/atan/operator.cc +++ b/src/infiniop/ops/atan/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atan.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atan_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanDescriptor( - infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atan::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtan( - infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef 
ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atan, Atan) diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc index 66ef4b1df..d19c978e4 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -1,48 +1,8 @@ #include "atanh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atanh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::cpu diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h index 8c2b04755..1a37453f0 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.h +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATANH_CPU_H__ #define __ATANH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atanh, cpu) - -namespace op::atanh::cpu { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atanh(x); - } -} AtanhOp; -} // namespace op::atanh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atanh, cpu, op::elementwise::unary::UnaryMode::Atanh) #endif // __ATANH_CPU_H__ diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh index 5337d8243..de0866ba5 100644 --- a/src/infiniop/ops/atanh/cuda/kernel.cuh +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATANH_CUDA_H__ #define __ATANH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atanh::cuda { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ 
__forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanhf(x); - } else { - return std::atanh(x); - } - } -} AtanhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atanh::cuda #endif // __ATANH_CUDA_H__ diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu index cb5a1ff03..55b435920 100644 --- a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atanh_nvidia.cuh" namespace op::atanh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::nvidia diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc index a73adcb23..cc9d6131e 100644 --- a/src/infiniop/ops/atanh/operator.cc +++ b/src/infiniop/ops/atanh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atanh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atanh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanhDescriptor( - infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atanh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - 
CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtanh( - infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atanh, Atanh) diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc index 17b3ec888..81ca2fe7a 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -1,48 +1,8 @@ #include "ceil_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::ceil::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::cpu diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h index c3ca8e441..423c784cc 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.h +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -1,26 +1,9 @@ #ifndef __CEIL_CPU_H__ #define __CEIL_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(ceil, cpu) - -namespace op::ceil::cpu { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; -} // namespace op::ceil::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(ceil, cpu, op::elementwise::unary::UnaryMode::Ceil) #endif // __CEIL_CPU_H__ diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh index a2d2e7fb5..1d30a42eb 100644 --- a/src/infiniop/ops/ceil/cuda/kernel.cuh +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __CEIL_CUDA_H__ #define __CEIL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::ceil::cuda { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2ceil(x); - } else if constexpr (std::is_same_v) { - return hceil(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(ceilf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return ceilf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::ceil::cuda #endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu index c7ad2ee5b..88ee35be8 100644 --- a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "ceil_nvidia.cuh" namespace op::ceil::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void 
*workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc index 4e5ee7800..dbe591043 100644 --- a/src/infiniop/ops/ceil/operator.cc +++ b/src/infiniop/ops/ceil/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/ceil.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/ceil_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCeilDescriptor( - infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::ceil::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCeil( - infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef 
ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(ceil, Ceil) diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc index 9dc68d327..19ef002cf 100644 --- a/src/infiniop/ops/cos/cpu/cos_cpu.cc +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -1,48 +1,8 @@ #include "cos_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::cos::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(cos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h index 9b4236fc2..d62aa91b8 100644 --- a/src/infiniop/ops/cos/cpu/cos_cpu.h +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -1,22 +1,9 @@ #ifndef __COS_CPU_H__ #define __COS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(cos, cpu) - -namespace op::cos::cpu { -typedef struct CosOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::cos(x); - } -} CosOp; -} // namespace op::cos::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(cos, cpu, op::elementwise::unary::UnaryMode::Cos) #endif // __COS_CPU_H__ diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh index b0dabb340..57fe4f50e 100644 --- a/src/infiniop/ops/cos/cuda/kernel.cuh +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __COS_CUDA_H__ #define __COS_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::cos::cuda { -typedef struct CosOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2cos(x); - } else if constexpr (std::is_same_v) { - return hcos(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(cosf(x0), cosf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(cosf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __cosf(x); - } else { - return std::cos(x); - } - } -} CosOp; +using Op 
= op::elementwise::unary::cuda::UnaryOp; } // namespace op::cos::cuda #endif // __COS_CUDA_H__ diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu index 044c59ca0..5da3c02e8 100644 --- a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cos_nvidia.cuh" namespace op::cos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 5c464ad60..1531c6caa 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCosDescriptor( - infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef 
ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCos( - infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cos, Cos) diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc index 9ed8e33da..e7b2a6dad 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -1,48 +1,8 @@ #include "cosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::cosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h index aea359ef2..c789d38ea 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.h +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __COSH_CPU_H__ #define __COSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(cosh, cpu) - -namespace op::cosh::cpu { -typedef struct CoshOp { -public: - static constexpr size_t 
num_inputs = 1; - - template - T operator()(const T &x) const { - return std::cosh(x); - } -} CoshOp; -} // namespace op::cosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(cosh, cpu, op::elementwise::unary::UnaryMode::Cosh) #endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh index ce6806433..934bfe12d 100644 --- a/src/infiniop/ops/cosh/cuda/kernel.cuh +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __COSH_CUDA_H__ #define __COSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::cosh::cuda { -typedef struct CoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(coshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(coshf(x0), coshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(coshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return coshf(x); - } else { - return std::cosh(x); - } - } -} CoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::cosh::cuda #endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu index a5e1442ce..038b0373e 100644 --- a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cosh_nvidia.cuh" namespace op::cosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc index 75aac0c91..9b18b47ee 100644 --- a/src/infiniop/ops/cosh/operator.cc +++ b/src/infiniop/ops/cosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" 
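
Each operator.cc in this patch shrinks to a single UNARY_OP_IMPL(op, Name) (or BINARY_OP_IMPL) invocation from the new "../../operator_impl.h" header, which is not itself part of the diff. Below is a minimal sketch of what such a macro presumably expands to, mirroring the hand-written entry points it replaces and omitting the per-backend #ifdef guards for brevity; the macro body shown here is an assumption, not the real operator_impl.h.

// Hypothetical sketch only, NOT the real operator_impl.h: UNARY_OP_IMPL(op, Name)
// presumably stamps out the C entry points that each unary operator.cc used to
// write by hand (per-backend #ifdef guards omitted here for brevity).
#define UNARY_OP_IMPL(OP_NS, OP_NAME)                                         \
    __C infiniStatus_t infiniopCreate##OP_NAME##Descriptor(                   \
        infiniopHandle_t handle,                                              \
        infiniop##OP_NAME##Descriptor_t *desc_ptr,                            \
        infiniopTensorDescriptor_t y_desc,                                    \
        infiniopTensorDescriptor_t x_desc) {                                  \
        switch (handle->device) {                                             \
        case INFINI_DEVICE_CPU:                                               \
            return op::OP_NS::cpu::Descriptor::create(                        \
                handle,                                                       \
                reinterpret_cast<op::OP_NS::cpu::Descriptor **>(desc_ptr),    \
                y_desc, {x_desc});                                            \
        case INFINI_DEVICE_NVIDIA:                                            \
            return op::OP_NS::nvidia::Descriptor::create(                     \
                handle,                                                       \
                reinterpret_cast<op::OP_NS::nvidia::Descriptor **>(desc_ptr), \
                y_desc, {x_desc});                                            \
        default:                                                              \
            return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;                   \
        }                                                                     \
    }
// The real macro would also generate infiniopGet<Name>WorkspaceSize, infiniop<Name>
// and infiniopDestroy<Name>Descriptor in the same fashion, so UNARY_OP_IMPL(cos, Cos)
// recreates exactly the four functions whose hand-written versions this patch removes.
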
-#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCoshDescriptor( - infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCosh( - infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cosh, Cosh) diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc index 19e222031..6d150070c 100644 --- a/src/infiniop/ops/div/cpu/div_cpu.cc +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -1,50 +1,8 @@ #include "div_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::div::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(div) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = 
reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h index 0373b766f..ad76e7ef1 100644 --- a/src/infiniop/ops/div/cpu/div_cpu.h +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -1,19 +1,9 @@ #ifndef __DIV_CPU_H__ #define __DIV_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(div, cpu) - -namespace op::div::cpu { -typedef struct DivOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return a / b; - } -} DivOp; -} // namespace op::div::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, op::elementwise::binary::BinaryMode::Divide) #endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh index a67993da5..f1ab13152 100644 --- a/src/infiniop/ops/div/cuda/kernel.cuh +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __DIV_CUDA_H__ #define __DIV_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::div::cuda { -typedef struct DivOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __h2div(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a / b; - } else if constexpr (std::is_same_v) { - return __fdividef(a, b); - } else { - return a / b; - } - } -} DivOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::div::cuda #endif // __DIV_CUDA_H__ diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu index 1abffe816..8aaba09b4 100644 --- a/src/infiniop/ops/div/nvidia/div_nvidia.cu +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "div_nvidia.cuh" namespace op::div::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(div) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const 
auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc index 84021a1af..af9d1929a 100644 --- a/src/infiniop/ops/div/operator.cc +++ b/src/infiniop/ops/div/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/div.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/div_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/div_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/div_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/div_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/div_moore.h" -#endif - -__C infiniStatus_t infiniopCreateDivDescriptor( - infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::div::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef 
ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopDiv( - infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(div, Div) diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc index 00b1897d1..d9119c697 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.cc +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -1,48 +1,8 @@ #include "erf_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::erf::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - 
return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h index c26f519cf..f50cd157d 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.h +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -1,22 +1,9 @@ #ifndef __ERF_CPU_H__ #define __ERF_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(erf, cpu) - -namespace op::erf::cpu { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::erf(x); - } -} ErfOp; -} // namespace op::erf::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(erf, cpu, op::elementwise::unary::UnaryMode::Erf) #endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh index 820c10b19..978890cff 100644 --- a/src/infiniop/ops/erf/cuda/kernel.cuh +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ERF_CUDA_H__ #define __ERF_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::erf::cuda { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(erff(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(erff(x0), erff(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(erff(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return erff(x); - } else { - return std::erf(x); - } - } -} ErfOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::erf::cuda #endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu index 9080593de..0d743b538 100644 --- a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "erf_nvidia.cuh" namespace op::erf::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, 
cuda::ErfOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc index 1491cfa9a..9304cf525 100644 --- a/src/infiniop/ops/erf/operator.cc +++ b/src/infiniop/ops/erf/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/erf.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/erf_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateErfDescriptor( - infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::erf::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopErf( - infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(erf, Erf) diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc 
b/src/infiniop/ops/floor/cpu/floor_cpu.cc index e809a02e2..cc717ac11 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.cc +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -1,48 +1,8 @@ #include "floor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::floor::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h index 91508a384..a246309e8 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.h +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -1,26 +1,9 @@ #ifndef __FLOOR_CPU_H__ #define __FLOOR_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(floor, cpu) - -namespace op::floor::cpu { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; -} // namespace op::floor::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(floor, cpu, op::elementwise::unary::UnaryMode::Floor) #endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh index c89ce34f4..23a7a44e9 100644 --- a/src/infiniop/ops/floor/cuda/kernel.cuh +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __FLOOR_CUDA_H__ #define __FLOOR_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::floor::cuda { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2floor(x); - } else if constexpr (std::is_same_v) { - return hfloor(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(floorf(x0), floorf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(floorf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return floorf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::floor::cuda #endif // 
__FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu index 08305048a..cec304a1c 100644 --- a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "floor_nvidia.cuh" namespace op::floor::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc index 4e4ed2b5a..64e4a586b 100644 --- a/src/infiniop/ops/floor/operator.cc +++ b/src/infiniop/ops/floor/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/floor.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/floor_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateFloorDescriptor( - infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::floor::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, 
nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopFloor( - infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(floor, Floor) diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc index e7314c319..734ad1617 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.cc +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -1,48 +1,8 @@ #include "log_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::log::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(log) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h index 535e681d3..b13d01442 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.h +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -1,22 +1,9 @@ #ifndef __LOG_CPU_H__ #define __LOG_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(log, cpu) - -namespace op::log::cpu { -typedef struct LogOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T 
&x) const { - return std::log(x); - } -} LogOp; -} // namespace op::log::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, op::elementwise::unary::UnaryMode::Log) #endif // __LOG_CPU_H__ diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh index b1e46873c..80980ada1 100644 --- a/src/infiniop/ops/log/cuda/kernel.cuh +++ b/src/infiniop/ops/log/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __LOG_CUDA_H__ #define __LOG_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::log::cuda { -typedef struct LogOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2log(x); - } else if constexpr (std::is_same_v) { - return __float2half(__logf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(logf(x0), logf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(logf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __logf(x); - } else { - return std::log(x); - } - } -} LogOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::log::cuda #endif // __LOG_CUDA_H__ diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu index 9e7bcafc4..87aaa0388 100644 --- a/src/infiniop/ops/log/nvidia/log_nvidia.cu +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "log_nvidia.cuh" namespace op::log::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(log) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::log::nvidia diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc index 8f2add408..9614a0861 100644 --- a/src/infiniop/ops/log/operator.cc +++ b/src/infiniop/ops/log/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/log.h" #ifdef ENABLE_CPU_API @@ -9,131 
+8,4 @@ #include "nvidia/log_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateLogDescriptor( - infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::log::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopLog( - infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(log, Log) diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc index 1b30fa4e4..98e8a52a2 100644 --- a/src/infiniop/ops/max/cpu/max_cpu.cc +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -1,50 +1,8 @@ #include "max_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::max::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = 
input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::max::cpu diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h index 4d085ed39..2219994d5 100644 --- a/src/infiniop/ops/max/cpu/max_cpu.h +++ b/src/infiniop/ops/max/cpu/max_cpu.h @@ -1,20 +1,9 @@ #ifndef __MAX_CPU_H__ #define __MAX_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -#include -ELEMENTWISE_DESCRIPTOR(max, cpu) - -namespace op::max::cpu { -typedef struct MaxOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::max(a, b); - } -} MaxOp; -} // namespace op::max::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(max, cpu, op::elementwise::binary::BinaryMode::Max) #endif // __MAX_CPU_H__ diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh index bf3977a31..68f634559 100644 --- a/src/infiniop/ops/max/cuda/kernel.cuh +++ b/src/infiniop/ops/max/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __MAX_CUDA_H__ #define __MAX_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::max::cuda { -typedef struct MaxOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __hmax2(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a > b ? a : b; - } else if constexpr (std::is_same_v) { - return fmaxf(a, b); - } else { - return a > b ? 
a : b; - } - } -} MaxOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::max::cuda #endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu index 5e9fb13f4..ba4620f3b 100644 --- a/src/infiniop/ops/max/nvidia/max_nvidia.cu +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "max_nvidia.cuh" namespace op::max::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc index e04368533..3e5299f52 100644 --- a/src/infiniop/ops/max/operator.cc +++ b/src/infiniop/ops/max/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/max.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/max_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/max_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/max_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/max_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/max_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMaxDescriptor( - infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::max::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMax( - infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(max, Max) diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc index dc30ee57f..1bac9ea61 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.cc +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -1,50 +1,8 @@ #include "min_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::min::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h index 1c84d4fca..74042db50 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.h +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -1,20 +1,9 @@ #ifndef __MIN_CPU_H__ #define __MIN_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -#include -ELEMENTWISE_DESCRIPTOR(min, cpu) - -namespace op::min::cpu { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::min(a, b); - } -} MinOp; -} // namespace op::min::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(min, cpu, op::elementwise::binary::BinaryMode::Min) #endif // __MIN_CPU_H__ diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh index aac14a0e8..75c6ab6b9 100644 --- a/src/infiniop/ops/min/cuda/kernel.cuh +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __MIN_CUDA_H__ #define __MIN_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::min::cuda { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __hmin2(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a < b ? a : b; - } else if constexpr (std::is_same_v) { - return fminf(a, b); - } else { - return a < b ? 
a : b; - } - } -} MinOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::min::cuda #endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu index 419655e29..0708cbcaf 100644 --- a/src/infiniop/ops/min/nvidia/min_nvidia.cu +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "min_nvidia.cuh" namespace op::min::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc index 8479feab4..6f67ecf87 100644 --- a/src/infiniop/ops/min/operator.cc +++ b/src/infiniop/ops/min/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/min.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/min_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/min_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/min_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/min_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/min_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMinDescriptor( - infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::min::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMin( - infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(min, Min) diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc index 907d05166..609c2e76e 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.cc +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -1,49 +1,8 @@ #include "mod_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::mod::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h index 9e78adca6..72ea7dede 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.h +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -1,23 +1,9 @@ #ifndef __MOD_CPU_H__ #define __MOD_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(mod, cpu) - -namespace op::mod::cpu { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; -} // namespace op::mod::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(mod, cpu, op::elementwise::binary::BinaryMode::Mod) #endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh index 0dcb54136..164784081 100644 --- a/src/infiniop/ops/mod/cuda/kernel.cuh +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -1,30 +1,10 @@ #ifndef __MOD_CUDA_H__ #define __MOD_CUDA_H__ -#include -#include +#include "../../../elementwise/binary.h" namespace op::mod::cuda { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - return __float2half(std::fmod(a_, b_)); - } else if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::mod::cuda #endif // __MOD_CUDA_H__ diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu index 64326d441..68b78ee70 100644 --- a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" 
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "mod_nvidia.cuh" namespace op::mod::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc index 85810e794..aef892ce1 100644 --- a/src/infiniop/ops/mod/operator.cc +++ b/src/infiniop/ops/mod/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/mod.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/mod_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateModDescriptor( - infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::mod::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMod( - infiniopModDescriptor_t desc, - void 
*workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(mod, Mod) diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc index 5da2ae4c3..47f4d2b2e 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.cc +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -1,48 +1,8 @@ #include "neg_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::neg::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::cpu diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h index ea45989b3..f6778a6d3 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.h +++ b/src/infiniop/ops/neg/cpu/neg_cpu.h @@ -2,19 +2,8 @@ #define __NEG_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(neg, cpu) - -namespace op::neg::cpu { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return -x; - } -} NegOp; -} // namespace op::neg::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(neg, cpu, op::elementwise::unary::UnaryMode::Neg) #endif // __NEG_CPU_H__ diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh 
b/src/infiniop/ops/neg/cuda/kernel.cuh index 57904b3df..f5cf5a449 100644 --- a/src/infiniop/ops/neg/cuda/kernel.cuh +++ b/src/infiniop/ops/neg/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __NEG_CUDA_H__ #define __NEG_CUDA_H__ -#include +#include "../../../elementwise/unary.h" namespace op::neg::cuda { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __hneg2(x); - } else if constexpr (std::is_same_v) { - return __hneg(x); - } else { - return -x; - } - } -} NegOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::neg::cuda #endif // __NEG_CUDA_H__ diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu index d18b8bf25..f568585f0 100644 --- a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu +++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "neg_nvidia.cuh" namespace op::neg::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::nvidia diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc index d4134df3e..c3945f4bb 100644 --- a/src/infiniop/ops/neg/operator.cc +++ b/src/infiniop/ops/neg/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/neg.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/neg_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateNegDescriptor( - infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::neg::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - 
CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopNeg( - infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(neg, Neg) diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc index 0c6fda0f7..1134d8aae 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.cc +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -1,49 +1,8 @@ #include "pow_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::pow::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: 
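// ---------------------------------------------------------------------------
// [Editorial aside - not part of the patch] The BINARY_OP_IMPL / UNARY_OP_IMPL
// macros used below regenerate the same four public C entry points whose
// hand-written bodies this patch deletes, so callers are unaffected. A minimal
// usage sketch for pow follows; run_pow_example is a hypothetical helper, and
// the handle, tensor descriptors, device buffers, workspace and stream are
// assumed to have been created elsewhere.
#include "infiniop/ops/pow.h"
#include <cstddef>

infiniStatus_t run_pow_example(infiniopHandle_t handle,
                               infiniopTensorDescriptor_t c_desc,
                               infiniopTensorDescriptor_t a_desc,
                               infiniopTensorDescriptor_t b_desc,
                               void *c, const void *a, const void *b,
                               void *workspace, size_t workspace_capacity,
                               void *stream) {
    infiniopPowDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreatePowDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetPowWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS && workspace_size > workspace_capacity) {
        // the caller must supply a workspace at least as large as the queried size
        status = INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    if (status == INFINI_STATUS_SUCCESS) {
        // c = a ** b, elementwise, on the device the handle was created for
        status = infiniopPow(desc, workspace, workspace_size, c, a, b, stream);
    }

    infiniopDestroyPowDescriptor(desc);
    return status;
}
// ---------------------------------------------------------------------------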
- return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h index 21d9bb897..9c8e8a368 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.h +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -1,19 +1,9 @@ #ifndef __POW_CPU_H__ #define __POW_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(pow, cpu) - -namespace op::pow::cpu { -typedef struct PowOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::pow(a, b); - } -} PowOp; -} // namespace op::pow::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, op::elementwise::binary::BinaryMode::Pow) #endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index 3786e7a52..0637240e8 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -1,40 +1,10 @@ #ifndef __POW_CUDA_H__ #define __POW_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/binary.h" namespace op::pow::cuda { -typedef struct PowOp { - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - float ans_f = __powf(a_, b_); - return __float2half(isnan(ans_f) ? 
std::pow(a_, b_) : ans_f); - } else if constexpr (std::is_same_v) { - float2 a_f2 = __bfloat1622float2(a); - float2 b_f2 = __bfloat1622float2(b); - return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); - } else if constexpr (std::is_same_v) { - float a_ = __bfloat162float(a); - float b_ = __bfloat162float(b); - return __float2bfloat16_rn(__powf(a_, b_)); - } else if constexpr (std::is_same_v) { - return __powf(a, b); - } else { - return std::pow(a, b); - } - } -} PowOp; - +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::pow::cuda #endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu index 3cfd0cd2f..63a3d40a3 100644 --- a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "pow_nvidia.cuh" namespace op::pow::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::nvidia diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc index e90639f67..b1ddbc9c1 100644 --- a/src/infiniop/ops/pow/operator.cc +++ b/src/infiniop/ops/pow/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/pow.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/pow_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreatePowDescriptor( - infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::pow::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - 
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopPow( - infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(pow, Pow) diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc index 52874c8b3..0b66eca64 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc @@ -1,48 +1,8 @@ #include "reciprocal_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::reciprocal::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::cpu diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h index 0a0f223f0..9af583ab7 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h @@ -2,19 +2,8 @@ #define __RECIPROCAL_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(reciprocal, cpu) - -namespace op::reciprocal::cpu { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return T(1) / x; - } -} ReciprocalOp; -} // namespace op::reciprocal::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(reciprocal, cpu, op::elementwise::unary::UnaryMode::Reciprocal) #endif // __RECIPROCAL_CPU_H__ diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh index 94c71de90..8c29a8e9e 100644 --- a/src/infiniop/ops/reciprocal/cuda/kernel.cuh +++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __RECIPROCAL_CUDA_H__ #define __RECIPROCAL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::reciprocal::cuda { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2rcp(x); - } else if constexpr (std::is_same_v) { - return hrcp(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __frcp_rn(x); - } else { - return T(1) / x; - } - } -} ReciprocalOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::reciprocal::cuda #endif // __RECIPROCAL_CUDA_H__ diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu index 45b74e25e..39a41b583 100644 --- a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "reciprocal_nvidia.cuh" namespace op::reciprocal::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, 
input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc index 033286024..966bd72d8 100644 --- a/src/infiniop/ops/reciprocal/operator.cc +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/reciprocal.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/reciprocal_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateReciprocalDescriptor( - infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::reciprocal::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopReciprocal( - infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return 
INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(reciprocal, Reciprocal) diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc index 0b0cea7b7..20ae304bd 100644 --- a/src/infiniop/ops/round/cpu/round_cpu.cc +++ b/src/infiniop/ops/round/cpu/round_cpu.cc @@ -1,48 +1,8 @@ #include "round_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::round::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(round) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::round::cpu diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h index eccd6df0f..1a755dbf8 100644 --- a/src/infiniop/ops/round/cpu/round_cpu.h +++ b/src/infiniop/ops/round/cpu/round_cpu.h @@ -2,24 +2,8 @@ #define __ROUND_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" -#include +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(round, cpu) - -namespace op::round::cpu { -typedef struct RoundOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::nearbyint(x); - } - } -} RoundOp; -} // namespace op::round::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(round, cpu, op::elementwise::unary::UnaryMode::Round) #endif // __ROUND_CPU_H__ diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh index c52a10716..f4de9c772 100644 --- a/src/infiniop/ops/round/cuda/kernel.cuh +++ b/src/infiniop/ops/round/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __ROUND_CUDA_H__ #define __ROUND_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::round::cuda { -typedef struct RoundOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2rint(x); - } else if constexpr (std::is_same_v) { - return hrint(x); - } else if constexpr (std::is_same_v) { - float x0 = 
__bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(rintf(x0), rintf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(rintf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return rintf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::nearbyint(x); - } - } -} RoundOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::round::cuda #endif // __ROUND_CUDA_H__ diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu index c1fabc885..dc84388a3 100644 --- a/src/infiniop/ops/round/nvidia/round_nvidia.cu +++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "round_nvidia.cuh" namespace op::round::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(round) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::round::nvidia diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc index 9468803c8..a20fbcb17 100644 --- a/src/infiniop/ops/round/operator.cc +++ b/src/infiniop/ops/round/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/round.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/round_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateRoundDescriptor( - infiniopHandle_t handle, - infiniopRoundDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::round::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C 
infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopRound( - infiniopRoundDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(round, Round) diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc index 1f3430e73..c65868d09 100644 --- a/src/infiniop/ops/sign/cpu/sign_cpu.cc +++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc @@ -1,48 +1,8 @@ #include "sign_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sign::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sign) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sign::cpu diff --git 
a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h index 505194c85..7ddeec543 100644 --- a/src/infiniop/ops/sign/cpu/sign_cpu.h +++ b/src/infiniop/ops/sign/cpu/sign_cpu.h @@ -2,19 +2,8 @@ #define __SIGN_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sign, cpu) - -namespace op::sign::cpu { -typedef struct SignOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); - } -} SignOp; -} // namespace op::sign::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sign, cpu, op::elementwise::unary::UnaryMode::Sign) #endif // __SIGN_CPU_H__ diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh index 3737282b0..a1216fb82 100644 --- a/src/infiniop/ops/sign/cuda/kernel.cuh +++ b/src/infiniop/ops/sign/cuda/kernel.cuh @@ -1,25 +1,10 @@ #ifndef __SIGN_CUDA_H__ #define __SIGN_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::sign::cuda { -typedef struct SignOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); - return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); - } else if constexpr (std::is_same_v) { - return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); - } else { - return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); - } - } -} SignOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sign::cuda #endif // __SIGN_CUDA_H__ diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu index 6a3152e41..2a11f9e23 100644 --- a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu +++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sign_nvidia.cuh" namespace op::sign::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sign) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return 
INFINI_STATUS_SUCCESS; -} } // namespace op::sign::nvidia diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc index 8f658a9b3..1a4599d5d 100644 --- a/src/infiniop/ops/sign/operator.cc +++ b/src/infiniop/ops/sign/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sign.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sign_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSignDescriptor( - infiniopHandle_t handle, - infiniopSignDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sign::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSign( - infiniopSignDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(sign, Sign) diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc index 40685847d..897439905 100644 --- a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc @@ -1,48 +1,8 @@ #include "sinh_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h index dbc8f3c7e..573027ee3 100644 --- a/src/infiniop/ops/sinh/cpu/sinh_cpu.h +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __SINH_CPU_H__ #define __SINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sinh, cpu) - -namespace op::sinh::cpu { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::sinh(x); - } -} SinhOp; -} // namespace op::sinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sinh, cpu, op::elementwise::unary::UnaryMode::Sinh) #endif // __SINH_CPU_H__ diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh index c09150666..d5bb7491f 100644 --- a/src/infiniop/ops/sinh/cuda/kernel.cuh +++ b/src/infiniop/ops/sinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __SINH_CUDA_H__ #define __SINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::sinh::cuda { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(sinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(sinhf(x0), sinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(sinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return sinhf(x); - } else { - return std::sinh(x); - } - } -} SinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sinh::cuda #endif // __SINH_CUDA_H__ diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu index d4c3fd165..3abfc2973 100644 --- a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu @@ -1,54 +1,10 @@ -#include 
"../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sinh_nvidia.cuh" namespace op::sinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::nvidia diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc index 1636ce2c8..41940d235 100644 --- a/src/infiniop/ops/sinh/operator.cc +++ b/src/infiniop/ops/sinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSinhDescriptor( - infiniopHandle_t handle, - infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSinh( - infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - 
-#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(sinh, Sinh) diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc index 99e723126..eb9ac4d66 100644 --- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc @@ -1,48 +1,8 @@ #include "sqrt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sqrt::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sqrt) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sqrt::cpu diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h index 3d026cf63..ed6217e1f 100644 --- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h +++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h @@ -1,22 +1,9 @@ #ifndef __SQRT_CPU_H__ #define __SQRT_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sqrt, cpu) - -namespace op::sqrt::cpu { -typedef struct SqrtOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::sqrt(x); - } -} SqrtOp; -} // namespace op::sqrt::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sqrt, cpu, op::elementwise::unary::UnaryMode::Sqrt) #endif // __SQRT_CPU_H__ diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh index 
c82cd7dd5..40ab9708f 100644 --- a/src/infiniop/ops/sqrt/cuda/kernel.cuh +++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __SQRT_CUDA_H__ #define __SQRT_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::sqrt::cuda { -typedef struct SqrtOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2sqrt(x); - } else if constexpr (std::is_same_v) { - return hsqrt(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __fsqrt_rn(x); - } else { - return std::sqrt(x); - } - } -} SqrtOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sqrt::cuda #endif // __SQRT_CUDA_H__ diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu index 519d06e89..4d6c70d72 100644 --- a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sqrt_nvidia.cuh" namespace op::sqrt::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sqrt::nvidia diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc index b11c8a4b5..fe999f58f 100644 --- a/src/infiniop/ops/sqrt/operator.cc +++ b/src/infiniop/ops/sqrt/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sqrt.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sqrt_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSqrtDescriptor( - infiniopHandle_t handle, - infiniopSqrtDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define 
CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sqrt::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSqrt( - infiniopSqrtDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(sqrt, Sqrt) diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc index 2947dfc5e..5166cf64f 100644 --- a/src/infiniop/ops/tan/cpu/tan_cpu.cc +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -1,48 +1,8 @@ #include "tan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::tan::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(tan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, 
out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h index c3a22456c..6c697c311 100644 --- a/src/infiniop/ops/tan/cpu/tan_cpu.h +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -1,22 +1,9 @@ #ifndef __TAN_CPU_H__ #define __TAN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(tan, cpu) - -namespace op::tan::cpu { -typedef struct TanOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::tan(x); - } -} TanOp; -} // namespace op::tan::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(tan, cpu, op::elementwise::unary::UnaryMode::Tan) #endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh index bbd8facaa..c3cf45350 100644 --- a/src/infiniop/ops/tan/cuda/kernel.cuh +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -1,55 +1,10 @@ #ifndef __TAN_CUDA_H__ #define __TAN_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include -#include - -#define TAN_THRESHOLD 15000 +#include "../../../elementwise/unary.h" namespace op::tan::cuda { -typedef struct TanOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2sin(x) / h2cos(x); - } else if constexpr (std::is_same_v) { - float tan_f = __tanf(__half2float(x)); - if (std::fabs(tan_f) > TAN_THRESHOLD) { - return __float2half(tanf(__half2float(x))); - } - return __float2half(tan_f); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - float tan_f0 = __tanf(x0); - float tan_f1 = __tanf(x1); - if (std::fabs(tan_f0) > TAN_THRESHOLD) { - tan_f0 = tanf(x0); - } - if (std::fabs(tan_f1) > TAN_THRESHOLD) { - tan_f1 = tanf(x1); - } - return __floats2bfloat162_rn(tan_f0, tan_f1); - } else if constexpr (std::is_same_v) { - float tan_f = __tanf(__bfloat162float(x)); - if (std::fabs(tan_f) > TAN_THRESHOLD) { - return __float2bfloat16_rn(tanf(__bfloat162float(x))); - } - return __float2bfloat16_rn(tan_f); - } else if constexpr (std::is_same_v) { - float tan_f = __tanf(x); - if (std::fabs(tan_f) > TAN_THRESHOLD) { - return tanf(x); - } - return tan_f; - } else { - return std::tan(x); - } - } -} TanOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::tan::cuda #endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu index b4c24e2fe..5f56dcb6f 100644 --- a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "tan_nvidia.cuh" namespace op::tan::nvidia { -Descriptor::~Descriptor() = default; 
+ELEMENTWISE_NVIDIA_IMPL_UNARY(tan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index 48ae8d48e..ae506dcd8 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/tan.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/tan_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateTanDescriptor( - infiniopHandle_t handle, - infiniopTanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::tan::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopTan( - infiniopTanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef 
ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(tan, Tan) diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh index e336a4995..d987ac7c5 100644 --- a/src/infiniop/ops/tanh/cuda/kernel.cuh +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -1,44 +1,10 @@ #ifndef __TANH_CUDA_H__ #define __TANH_CUDA_H__ -#include +#include "../../../elementwise/unary.h" namespace op::tanh::cuda { -typedef struct TanhOp { - static constexpr size_t num_inputs = 1; - - __device__ __forceinline__ float tanh_f32_func(float x) const { - return tanhf(x); - } - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = tanh_f32_func(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - float r0 = tanh_f32_func(f0); - float r1 = tanh_f32_func(f1); - return __floats2bfloat162_rn(r0, r1); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - float rf = tanh_f32_func(xf); - return __float2bfloat16_rn(rf); - } else if constexpr (std::is_same_v) { - return tanh_f32_func(input); - } else if constexpr (std::is_same_v) { - return std::tanh(input); - } else { - return std::tanh(input); - } - } -} TanhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::tanh::cuda #endif // __TANH_CUDA_H__ diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu index a2c36551c..62f02da67 100644 --- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -1,59 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "tanh_nvidia.cuh" namespace op::tanh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(tanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, 
INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::tanh::nvidia From 2ed76b9f46a0e5ff5436b361cebc94e495f4d98a Mon Sep 17 00:00:00 2001 From: gongchensu Date: Fri, 16 Jan 2026 06:34:19 +0000 Subject: [PATCH 4/7] Issue/887 - Refactor: consolidate binary/unary operator headers and tests into unified files --- include/infiniop.h | 27 +- include/infiniop/ops/abs.h | 8 - include/infiniop/ops/acos.h | 8 - include/infiniop/ops/acosh.h | 8 - include/infiniop/ops/asin.h | 8 - include/infiniop/ops/asinh.h | 8 - include/infiniop/ops/atan.h | 8 - include/infiniop/ops/atanh.h | 8 - include/infiniop/ops/binary_ops_api.h | 23 + include/infiniop/ops/ceil.h | 8 - include/infiniop/ops/cos.h | 8 - include/infiniop/ops/cosh.h | 8 - include/infiniop/ops/div.h | 8 - include/infiniop/ops/erf.h | 8 - include/infiniop/ops/floor.h | 8 - include/infiniop/ops/log.h | 8 - include/infiniop/ops/max.h | 8 - include/infiniop/ops/min.h | 8 - include/infiniop/ops/mod.h | 8 - include/infiniop/ops/neg.h | 8 - include/infiniop/ops/pow.h | 8 - include/infiniop/ops/reciprocal.h | 8 - include/infiniop/ops/round.h | 8 - include/infiniop/ops/sign.h | 8 - include/infiniop/ops/sinh.h | 8 - include/infiniop/ops/sqrt.h | 8 - include/infiniop/ops/tan.h | 8 - include/infiniop/ops/unary_ops_api.h | 39 ++ scripts/test_binary_unary.py | 143 ----- src/infiniop/ops/abs/operator.cc | 2 +- src/infiniop/ops/acos/operator.cc | 2 +- src/infiniop/ops/acosh/operator.cc | 2 +- src/infiniop/ops/asin/operator.cc | 2 +- src/infiniop/ops/asinh/operator.cc | 2 +- src/infiniop/ops/atan/operator.cc | 2 +- src/infiniop/ops/atanh/operator.cc | 2 +- src/infiniop/ops/ceil/operator.cc | 2 +- src/infiniop/ops/cos/operator.cc | 2 +- src/infiniop/ops/cosh/operator.cc | 2 +- src/infiniop/ops/div/operator.cc | 2 +- src/infiniop/ops/erf/operator.cc | 2 +- src/infiniop/ops/floor/operator.cc | 2 +- src/infiniop/ops/log/operator.cc | 2 +- src/infiniop/ops/max/operator.cc | 2 +- src/infiniop/ops/min/operator.cc | 2 +- src/infiniop/ops/mod/operator.cc | 2 +- src/infiniop/ops/neg/operator.cc | 2 +- src/infiniop/ops/pow/operator.cc | 2 +- src/infiniop/ops/reciprocal/operator.cc | 2 +- src/infiniop/ops/round/operator.cc | 2 +- src/infiniop/ops/sign/operator.cc | 2 +- src/infiniop/ops/sinh/operator.cc | 2 +- src/infiniop/ops/sqrt/operator.cc | 2 +- src/infiniop/ops/tan/operator.cc | 2 +- test/infiniop/abs.py | 164 ------ test/infiniop/acos.py | 165 ------ test/infiniop/acosh.py | 165 ------ test/infiniop/asin.py 
| 165 ------ test/infiniop/asinh.py | 165 ------ test/infiniop/atan.py | 164 ------ test/infiniop/atanh.py | 165 ------ test/infiniop/ceil.py | 165 ------ test/infiniop/cos.py | 166 ------ test/infiniop/cosh.py | 165 ------ test/infiniop/div.py | 192 ------ test/infiniop/erf.py | 165 ------ test/infiniop/floor.py | 165 ------ test/infiniop/libinfiniop/binary_test_base.py | 273 +++++++++ test/infiniop/libinfiniop/unary_test_base.py | 242 ++++++++ test/infiniop/log.py | 166 ------ test/infiniop/max.py | 189 ------ test/infiniop/min.py | 189 ------ test/infiniop/mod.py | 190 ------ test/infiniop/neg.py | 165 ------ test/infiniop/pow.py | 190 ------ test/infiniop/reciprocal.py | 168 ------ test/infiniop/round.py | 165 ------ test/infiniop/sign.py | 166 ------ test/infiniop/sinh.py | 166 ------ test/infiniop/sqrt.py | 166 ------ test/infiniop/tan.py | 167 ------ test/infiniop/test_all_binary_ops.py | 251 ++++++++ test/infiniop/test_all_unary_ops.py | 548 ++++++++++++++++++ 83 files changed, 1405 insertions(+), 4649 deletions(-) delete mode 100644 include/infiniop/ops/abs.h delete mode 100644 include/infiniop/ops/acos.h delete mode 100644 include/infiniop/ops/acosh.h delete mode 100644 include/infiniop/ops/asin.h delete mode 100644 include/infiniop/ops/asinh.h delete mode 100644 include/infiniop/ops/atan.h delete mode 100644 include/infiniop/ops/atanh.h create mode 100644 include/infiniop/ops/binary_ops_api.h delete mode 100644 include/infiniop/ops/ceil.h delete mode 100644 include/infiniop/ops/cos.h delete mode 100644 include/infiniop/ops/cosh.h delete mode 100644 include/infiniop/ops/div.h delete mode 100644 include/infiniop/ops/erf.h delete mode 100644 include/infiniop/ops/floor.h delete mode 100644 include/infiniop/ops/log.h delete mode 100644 include/infiniop/ops/max.h delete mode 100644 include/infiniop/ops/min.h delete mode 100644 include/infiniop/ops/mod.h delete mode 100644 include/infiniop/ops/neg.h delete mode 100644 include/infiniop/ops/pow.h delete mode 100644 include/infiniop/ops/reciprocal.h delete mode 100644 include/infiniop/ops/round.h delete mode 100644 include/infiniop/ops/sign.h delete mode 100644 include/infiniop/ops/sinh.h delete mode 100644 include/infiniop/ops/sqrt.h delete mode 100644 include/infiniop/ops/tan.h create mode 100644 include/infiniop/ops/unary_ops_api.h delete mode 100755 scripts/test_binary_unary.py delete mode 100644 test/infiniop/abs.py delete mode 100644 test/infiniop/acos.py delete mode 100644 test/infiniop/acosh.py delete mode 100644 test/infiniop/asin.py delete mode 100644 test/infiniop/asinh.py delete mode 100644 test/infiniop/atan.py delete mode 100644 test/infiniop/atanh.py delete mode 100644 test/infiniop/ceil.py delete mode 100644 test/infiniop/cos.py delete mode 100644 test/infiniop/cosh.py delete mode 100644 test/infiniop/div.py delete mode 100644 test/infiniop/erf.py delete mode 100644 test/infiniop/floor.py create mode 100644 test/infiniop/libinfiniop/binary_test_base.py create mode 100644 test/infiniop/libinfiniop/unary_test_base.py delete mode 100644 test/infiniop/log.py delete mode 100644 test/infiniop/max.py delete mode 100644 test/infiniop/min.py delete mode 100644 test/infiniop/mod.py delete mode 100644 test/infiniop/neg.py delete mode 100644 test/infiniop/pow.py delete mode 100644 test/infiniop/reciprocal.py delete mode 100644 test/infiniop/round.py delete mode 100644 test/infiniop/sign.py delete mode 100644 test/infiniop/sinh.py delete mode 100644 test/infiniop/sqrt.py delete mode 100644 test/infiniop/tan.py create mode 100644 
test/infiniop/test_all_binary_ops.py create mode 100644 test/infiniop/test_all_unary_ops.py diff --git a/include/infiniop.h b/include/infiniop.h index 4778fce90..e87839bc2 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,47 +2,29 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" -#include "infiniop/ops/abs.h" -#include "infiniop/ops/acos.h" -#include "infiniop/ops/acosh.h" +// Unified headers for elementwise operators +#include "infiniop/ops/unary_ops_api.h" +#include "infiniop/ops/binary_ops_api.h" +// Other operators #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" -#include "infiniop/ops/asin.h" -#include "infiniop/ops/asinh.h" -#include "infiniop/ops/atan.h" -#include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" -#include "infiniop/ops/ceil.h" -#include "infiniop/ops/cos.h" -#include "infiniop/ops/cosh.h" -#include "infiniop/ops/erf.h" -#include "infiniop/ops/floor.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" -#include "infiniop/ops/div.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" -#include "infiniop/ops/log.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" -#include "infiniop/ops/max.h" -#include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" -#include "infiniop/ops/neg.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/random_sample.h" -#include "infiniop/ops/reciprocal.h" #include "infiniop/ops/rearrange.h" -#include "infiniop/ops/round.h" -#include "infiniop/ops/sign.h" -#include "infiniop/ops/sinh.h" -#include "infiniop/ops/sqrt.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" @@ -52,7 +34,6 @@ #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" -#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h deleted file mode 100644 index 1d1f1cbd1..000000000 --- a/include/infiniop/ops/abs.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ABS_API_H__ -#define __INFINIOP_ABS_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(abs, Abs) - -#endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h deleted file mode 100644 index c2f4de837..000000000 --- a/include/infiniop/ops/acos.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ACOS_API_H__ -#define __INFINIOP_ACOS_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(acos, Acos) - -#endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h deleted file mode 100644 index e8630b7d5..000000000 --- a/include/infiniop/ops/acosh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ACOSH_API_H__ -#define __INFINIOP_ACOSH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(acosh, Acosh) - -#endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h deleted file mode 100644 index 1a8bdd7b8..000000000 --- a/include/infiniop/ops/asin.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ASIN_API_H__ -#define __INFINIOP_ASIN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(asin, Asin) - -#endif diff --git a/include/infiniop/ops/asinh.h 
b/include/infiniop/ops/asinh.h deleted file mode 100644 index 2a3aebf5a..000000000 --- a/include/infiniop/ops/asinh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ASINH_API_H__ -#define __INFINIOP_ASINH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(asinh, Asinh) - -#endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h deleted file mode 100644 index 18eed316f..000000000 --- a/include/infiniop/ops/atan.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ATAN_API_H__ -#define __INFINIOP_ATAN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(atan, Atan) - -#endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h deleted file mode 100644 index e7db5b53c..000000000 --- a/include/infiniop/ops/atanh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ATANH_API_H__ -#define __INFINIOP_ATANH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(atanh, Atanh) - -#endif diff --git a/include/infiniop/ops/binary_ops_api.h b/include/infiniop/ops/binary_ops_api.h new file mode 100644 index 000000000..24d7715c9 --- /dev/null +++ b/include/infiniop/ops/binary_ops_api.h @@ -0,0 +1,23 @@ +#ifndef __INFINIOP_BINARY_OPS_API_H__ +#define __INFINIOP_BINARY_OPS_API_H__ + +#include "binary_op_api.h" + +/** + * @brief Unified API declarations for all binary operators. + * + * This header contains API declarations for all binary operators in a single file, + * eliminating the need for individual header files for each operator. + * + * All binary operator APIs are declared here: + * - div, pow, mod, max, min + */ + +// Declare all binary operator APIs +BINARY_OP_API_DECLARE(div, Div) +BINARY_OP_API_DECLARE(pow, Pow) +BINARY_OP_API_DECLARE(mod, Mod) +BINARY_OP_API_DECLARE(max, Max) +BINARY_OP_API_DECLARE(min, Min) + +#endif // __INFINIOP_BINARY_OPS_API_H__ diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h deleted file mode 100644 index 8fca73b2e..000000000 --- a/include/infiniop/ops/ceil.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_CEIL_API_H__ -#define __INFINIOP_CEIL_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(ceil, Ceil) - -#endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h deleted file mode 100644 index ed33b0a0e..000000000 --- a/include/infiniop/ops/cos.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_COS_API_H__ -#define __INFINIOP_COS_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(cos, Cos) - -#endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h deleted file mode 100644 index b607b8fd1..000000000 --- a/include/infiniop/ops/cosh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_COSH_API_H__ -#define __INFINIOP_COSH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(cosh, Cosh) - -#endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h deleted file mode 100644 index 6f146bf4c..000000000 --- a/include/infiniop/ops/div.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_DIV_API_H__ -#define __INFINIOP_DIV_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(div, Div) - -#endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h deleted file mode 100644 index 0dcc149da..000000000 --- a/include/infiniop/ops/erf.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ERF_API_H__ -#define __INFINIOP_ERF_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(erf, Erf) - -#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h deleted file mode 
100644 index 02efc6761..000000000 --- a/include/infiniop/ops/floor.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_FLOOR_API_H__ -#define __INFINIOP_FLOOR_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(floor, Floor) - -#endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h deleted file mode 100644 index 3892ccb6e..000000000 --- a/include/infiniop/ops/log.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_LOG_API_H__ -#define __INFINIOP_LOG_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(log, Log) - -#endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h deleted file mode 100644 index 4b91e5c83..000000000 --- a/include/infiniop/ops/max.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_MAX_API_H__ -#define __INFINIOP_MAX_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(max, Max) - -#endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h deleted file mode 100644 index 1496806df..000000000 --- a/include/infiniop/ops/min.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_MIN_API_H__ -#define __INFINIOP_MIN_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(min, Min) - -#endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h deleted file mode 100644 index e4fcd571e..000000000 --- a/include/infiniop/ops/mod.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_MOD_API_H__ -#define __INFINIOP_MOD_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(mod, Mod) - -#endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h deleted file mode 100644 index 0d18bbd5c..000000000 --- a/include/infiniop/ops/neg.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_NEG_API_H__ -#define __INFINIOP_NEG_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(neg, Neg) - -#endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h deleted file mode 100644 index f4e263a58..000000000 --- a/include/infiniop/ops/pow.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_POW_API_H__ -#define __INFINIOP_POW_API_H__ - -#include "binary_op_api.h" - -BINARY_OP_API_DECLARE(pow, Pow) - -#endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h deleted file mode 100644 index 7d5626176..000000000 --- a/include/infiniop/ops/reciprocal.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_RECIPROCAL_API_H__ -#define __INFINIOP_RECIPROCAL_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(reciprocal, Reciprocal) - -#endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h deleted file mode 100644 index 1bf4377ff..000000000 --- a/include/infiniop/ops/round.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_ROUND_API_H__ -#define __INFINIOP_ROUND_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(round, Round) - -#endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h deleted file mode 100644 index ef7854de8..000000000 --- a/include/infiniop/ops/sign.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_SIGN_API_H__ -#define __INFINIOP_SIGN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(sign, Sign) - -#endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h deleted file mode 100644 index ea8511a2b..000000000 --- a/include/infiniop/ops/sinh.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_SINH_API_H__ -#define __INFINIOP_SINH_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(sinh, Sinh) - -#endif diff --git 
a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h deleted file mode 100644 index 6df6fe89c..000000000 --- a/include/infiniop/ops/sqrt.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_SQRT_API_H__ -#define __INFINIOP_SQRT_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(sqrt, Sqrt) - -#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h deleted file mode 100644 index d4a2f0bf2..000000000 --- a/include/infiniop/ops/tan.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __INFINIOP_TAN_API_H__ -#define __INFINIOP_TAN_API_H__ - -#include "unary_op_api.h" - -UNARY_OP_API_DECLARE(tan, Tan) - -#endif diff --git a/include/infiniop/ops/unary_ops_api.h b/include/infiniop/ops/unary_ops_api.h new file mode 100644 index 000000000..95b0773b6 --- /dev/null +++ b/include/infiniop/ops/unary_ops_api.h @@ -0,0 +1,39 @@ +#ifndef __INFINIOP_UNARY_OPS_API_H__ +#define __INFINIOP_UNARY_OPS_API_H__ + +#include "unary_op_api.h" + +/** + * @brief Unified API declarations for all unary operators. + * + * This header contains API declarations for all unary operators in a single file, + * eliminating the need for individual header files for each operator. + * + * All unary operator APIs are declared here: + * - abs, log, sqrt, reciprocal, neg, round, sinh, sign, tan + * - acosh, asinh, cos, atanh, asin, floor, cosh, erf, atan, acos, ceil + */ + +// Declare all unary operator APIs +UNARY_OP_API_DECLARE(abs, Abs) +UNARY_OP_API_DECLARE(log, Log) +UNARY_OP_API_DECLARE(sqrt, Sqrt) +UNARY_OP_API_DECLARE(reciprocal, Reciprocal) +UNARY_OP_API_DECLARE(neg, Neg) +UNARY_OP_API_DECLARE(round, Round) +UNARY_OP_API_DECLARE(sinh, Sinh) +UNARY_OP_API_DECLARE(sign, Sign) +UNARY_OP_API_DECLARE(tan, Tan) +UNARY_OP_API_DECLARE(acosh, Acosh) +UNARY_OP_API_DECLARE(asinh, Asinh) +UNARY_OP_API_DECLARE(cos, Cos) +UNARY_OP_API_DECLARE(atanh, Atanh) +UNARY_OP_API_DECLARE(asin, Asin) +UNARY_OP_API_DECLARE(floor, Floor) +UNARY_OP_API_DECLARE(cosh, Cosh) +UNARY_OP_API_DECLARE(erf, Erf) +UNARY_OP_API_DECLARE(atan, Atan) +UNARY_OP_API_DECLARE(acos, Acos) +UNARY_OP_API_DECLARE(ceil, Ceil) + +#endif // __INFINIOP_UNARY_OPS_API_H__ diff --git a/scripts/test_binary_unary.py b/scripts/test_binary_unary.py deleted file mode 100755 index 8dbbfbf53..000000000 --- a/scripts/test_binary_unary.py +++ /dev/null @@ -1,143 +0,0 @@ -import os -import subprocess -from set_env import set_env -import sys - -PROJECT_DIR = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "test", "infiniop") -) -os.chdir(PROJECT_DIR) - - -def run_tests(args): - failed = [] - - # Binary operators (重构过的) - binary_tests = [ - "div.py", - "pow.py", - "mod.py", - "min.py", - "max.py", - ] - - # Unary operators (重构过的) - unary_tests = [ - "abs.py", - "log.py", - "cos.py", - "sqrt.py", - "neg.py", - "sign.py", - "reciprocal.py", - "round.py", - "floor.py", - "ceil.py", - "erf.py", - "cosh.py", - "sinh.py", - "tan.py", - "acos.py", - "acosh.py", - "asin.py", - "asinh.py", - "atan.py", - "atanh.py", - ] - - all_tests = binary_tests + unary_tests - - print("\033[94m" + "=" * 60 + "\033[0m") - print("\033[94mTesting Binary and Unary Operators (Refactored)\033[0m") - print("\033[94m" + "=" * 60 + "\033[0m") - print(f"\033[94mTotal tests: {len(all_tests)}\033[0m") - print(f"\033[94m - Binary operators: {len(binary_tests)}\033[0m") - print(f"\033[94m - Unary operators: {len(unary_tests)}\033[0m") - print() - - for test in all_tests: - if not os.path.exists(test): - print(f"\033[93m[SKIP] {test} - test file not found\033[0m") - continue - - 
print(f"\033[96m[RUN] {test}\033[0m", end=" ... ", flush=True) - result = subprocess.run( - f"python3 {test} {args}", - text=True, - encoding="utf-8", - shell=True, - capture_output=True - ) - - if result.returncode != 0: - print(f"\033[91m[FAIL]\033[0m") - print(f"\033[91mError output:\033[0m") - print(result.stderr) - failed.append(test) - else: - print(f"\033[92m[PASS]\033[0m") - - return failed - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser( - description="Test refactored binary and unary operators", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Test on CPU only (default) - python3 scripts/test_binary_unary.py --cpu - - # Test on NVIDIA GPU only - python3 scripts/test_binary_unary.py --nvidia - - # Test on both CPU and NVIDIA - python3 scripts/test_binary_unary.py --cpu --nvidia - - # Test with debug mode - python3 scripts/test_binary_unary.py --cpu --debug - - # Test with profiling - python3 scripts/test_binary_unary.py --nvidia --profile - """ - ) - - # Device selection arguments (same as test files) - parser.add_argument("--cpu", action="store_true", help="Run CPU tests") - parser.add_argument("--nvidia", action="store_true", help="Run NVIDIA GPU tests") - parser.add_argument("--debug", action="store_true", help="Enable debug mode") - parser.add_argument("--profile", action="store_true", help="Enable profiling") - - args, unknown = parser.parse_known_args() - - # Build command line arguments to pass to test files - test_args = [] - if args.cpu: - test_args.append("--cpu") - if args.nvidia: - test_args.append("--nvidia") - if args.debug: - test_args.append("--debug") - if args.profile: - test_args.append("--profile") - - # Add any unknown arguments (for compatibility) - test_args.extend(unknown) - - set_env() - failed = run_tests(" ".join(test_args)) - - print() - print("\033[94m" + "=" * 60 + "\033[0m") - if len(failed) == 0: - print("\033[92m✓ All tests passed!\033[0m") - else: - print(f"\033[91m✗ {len(failed)} test(s) failed:\033[0m") - for test in failed: - print(f"\033[91m - {test}\033[0m") - print("\033[94m" + "=" * 60 + "\033[0m") - - exit(len(failed)) diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc index 051b8711a..8439236eb 100644 --- a/src/infiniop/ops/abs/operator.cc +++ b/src/infiniop/ops/abs/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/abs.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/abs_cpu.h" diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc index 15872b754..3fd50fb51 100644 --- a/src/infiniop/ops/acos/operator.cc +++ b/src/infiniop/ops/acos/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/acos.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/acos_cpu.h" diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc index c1939a54c..0fb30c0f6 100644 --- a/src/infiniop/ops/acosh/operator.cc +++ b/src/infiniop/ops/acosh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/acosh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/acosh_cpu.h" diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc index edb8fa867..8ed07d55d 100644 --- a/src/infiniop/ops/asin/operator.cc +++ b/src/infiniop/ops/asin/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include 
"infiniop/ops/asin.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/asin_cpu.h" diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc index 7b519ec05..020f83dc4 100644 --- a/src/infiniop/ops/asinh/operator.cc +++ b/src/infiniop/ops/asinh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/asinh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/asinh_cpu.h" diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc index 9025489c3..2ee3ad449 100644 --- a/src/infiniop/ops/atan/operator.cc +++ b/src/infiniop/ops/atan/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/atan.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/atan_cpu.h" diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc index cc9d6131e..fb991051c 100644 --- a/src/infiniop/ops/atanh/operator.cc +++ b/src/infiniop/ops/atanh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/atanh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/atanh_cpu.h" diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc index dbe591043..26252ec16 100644 --- a/src/infiniop/ops/ceil/operator.cc +++ b/src/infiniop/ops/ceil/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/ceil.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/ceil_cpu.h" diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 1531c6caa..e3d9237a9 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/cos.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/cos_cpu.h" diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc index 9b18b47ee..c1a6159c1 100644 --- a/src/infiniop/ops/cosh/operator.cc +++ b/src/infiniop/ops/cosh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/cosh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/cosh_cpu.h" diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc index af9d1929a..4ed2374af 100644 --- a/src/infiniop/ops/div/operator.cc +++ b/src/infiniop/ops/div/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/div.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/div_cpu.h" diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc index 9304cf525..eeee864ee 100644 --- a/src/infiniop/ops/erf/operator.cc +++ b/src/infiniop/ops/erf/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/erf.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/erf_cpu.h" diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc index 64e4a586b..bfb4a2466 100644 --- a/src/infiniop/ops/floor/operator.cc +++ b/src/infiniop/ops/floor/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/floor.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/floor_cpu.h" diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc index 9614a0861..b4814ff72 100644 --- 
a/src/infiniop/ops/log/operator.cc +++ b/src/infiniop/ops/log/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/log.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/log_cpu.h" diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc index 3e5299f52..03b6d4eeb 100644 --- a/src/infiniop/ops/max/operator.cc +++ b/src/infiniop/ops/max/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/max.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/max_cpu.h" diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc index 6f67ecf87..1597bb5d3 100644 --- a/src/infiniop/ops/min/operator.cc +++ b/src/infiniop/ops/min/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/min.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/min_cpu.h" diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc index aef892ce1..9f635d6e6 100644 --- a/src/infiniop/ops/mod/operator.cc +++ b/src/infiniop/ops/mod/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/mod.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/mod_cpu.h" diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc index c3945f4bb..e8c99dcdf 100644 --- a/src/infiniop/ops/neg/operator.cc +++ b/src/infiniop/ops/neg/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/neg.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/neg_cpu.h" diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc index b1ddbc9c1..7a24d7a20 100644 --- a/src/infiniop/ops/pow/operator.cc +++ b/src/infiniop/ops/pow/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/pow.h" +#include "infiniop/ops/binary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/pow_cpu.h" diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc index 966bd72d8..4c55fdf20 100644 --- a/src/infiniop/ops/reciprocal/operator.cc +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/reciprocal.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/reciprocal_cpu.h" diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc index a20fbcb17..5a1e0fcc5 100644 --- a/src/infiniop/ops/round/operator.cc +++ b/src/infiniop/ops/round/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/round.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/round_cpu.h" diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc index 1a4599d5d..18850ec1f 100644 --- a/src/infiniop/ops/sign/operator.cc +++ b/src/infiniop/ops/sign/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/sign.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/sign_cpu.h" diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc index 41940d235..263d20347 100644 --- a/src/infiniop/ops/sinh/operator.cc +++ b/src/infiniop/ops/sinh/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/sinh.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include 
"cpu/sinh_cpu.h" diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc index fe999f58f..5962860ca 100644 --- a/src/infiniop/ops/sqrt/operator.cc +++ b/src/infiniop/ops/sqrt/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/sqrt.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/sqrt_cpu.h" diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index ae506dcd8..75dd8277e 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -1,5 +1,5 @@ #include "../../operator_impl.h" -#include "infiniop/ops/tan.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/tan_cpu.h" diff --git a/test/infiniop/abs.py b/test/infiniop/abs.py deleted file mode 100644 index df8748a97..000000000 --- a/test/infiniop/abs.py +++ /dev/null @@ -1,164 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def abs_op(x): - return torch.abs(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for abs operation - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Abs on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = abs_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAbsDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - 
tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAbsWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_abs(): - check_error( - LIBINFINIOP.infiniopAbs( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_abs() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: abs_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_abs(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acos.py b/test/infiniop/acos.py deleted file mode 100644 index d39e966c4..000000000 --- a/test/infiniop/acos.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def acos_op(x): - return torch.acos(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for acos operation - # acos domain is [-1, 1], so we use range [-1, 1) - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, 
dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Acos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = acos_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAcosDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAcosWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_acos(): - check_error( - LIBINFINIOP.infiniopAcos( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_acos() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: acos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_acos(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAcosDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acosh.py b/test/infiniop/acosh.py deleted file mode 100644 index c6777998b..000000000 --- a/test/infiniop/acosh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE 
= False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def acosh_op(x): - return torch.acosh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [1, 101) for acosh operation - # acosh domain is [1, +∞), so we use range [1, 101) - x_torch_tensor = torch.rand(shape) * 100 + 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Acosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = acosh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAcoshDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAcoshWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_acosh(): - check_error( - LIBINFINIOP.infiniopAcosh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_acosh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: acosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_acosh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAcoshDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asin.py b/test/infiniop/asin.py deleted file mode 100644 index 18cf0ec8e..000000000 --- a/test/infiniop/asin.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in 
_TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def asin_op(x): - return torch.asin(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for asin operation - # asin domain is [-1, 1], so we use range [-1, 1) - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Asin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = asin_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAsinDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAsinWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_asin(): - check_error( - LIBINFINIOP.infiniopAsin( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_asin() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: asin_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_asin(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAsinDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asinh.py b/test/infiniop/asinh.py deleted file mode 100644 index d051d486e..000000000 --- a/test/infiniop/asinh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - 
infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def asinh_op(x): - return torch.asinh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [0, 100) for asinh operation - # asinh domain is (-∞, +∞), so we use range [0, 100) - x_torch_tensor = torch.rand(shape) * 100 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Asinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = asinh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAsinhDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAsinhWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_asinh(): - check_error( - LIBINFINIOP.infiniopAsinh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_asinh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: asinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_asinh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAsinhDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - 
for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atan.py b/test/infiniop/atan.py deleted file mode 100644 index 01fceff5b..000000000 --- a/test/infiniop/atan.py +++ /dev/null @@ -1,164 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3, 13, 9, 17),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def atan_op(x): - return torch.atan(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for atan operation - # atan domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Atan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = atan_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAtanDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAtanWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_atan(): - check_error( - LIBINFINIOP.infiniopAtan( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_atan() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert 
torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: atan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_atan(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAtanDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py deleted file mode 100644 index 74073a6f2..000000000 --- a/test/infiniop/atanh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def atanh_op(x): - return torch.atanh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-1, 1) for atanh operation - # atanh domain is (-1, 1), so we use range [-1, 1) - x_torch_tensor = torch.rand(shape) * 2 - 1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = atanh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateAtanhDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being 
directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetAtanhWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_atanh(): - check_error( - LIBINFINIOP.infiniopAtanh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_atanh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: atanh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/ceil.py b/test/infiniop/ceil.py deleted file mode 100644 index afc1993c1..000000000 --- a/test/infiniop/ceil.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def ceil_op(x): - return torch.ceil(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-20, -10) for ceil operation - # ceil domain is (-∞, +∞), so we use range [-20, -10) - x_torch_tensor = torch.rand(shape) * 10 - 20 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - 
set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Ceil on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = ceil_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateCeilDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetCeilWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_ceil(): - check_error( - LIBINFINIOP.infiniopCeil( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_ceil() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: ceil_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_ceil(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyCeilDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py deleted file mode 100644 index 972f17b7b..000000000 --- a/test/infiniop/cos.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: 
atol=1e-4, rtol=1e-2 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, - InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def cos_op(x): - return torch.cos(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for cos operation - # cos domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = cos_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateCosDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetCosWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_cos(): - check_error( - LIBINFINIOP.infiniopCos( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_cos() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: cos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cosh.py b/test/infiniop/cosh.py deleted file mode 100644 index ee7994531..000000000 --- a/test/infiniop/cosh.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - 
((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def cosh_op(x): - return torch.cosh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for cosh operation - # cosh domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Cosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = cosh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateCoshDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetCoshWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_cosh(): - check_error( - LIBINFINIOP.infiniopCosh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_cosh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: cosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_cosh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyCoshDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py deleted file mode 100644 index 17b22b2e5..000000000 --- a/test/infiniop/div.py +++ /dev/null @@ -1,192 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - 
test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: F32 tolerance is relaxed compared to theoretical precision due to: -# - Old operators library uses vectorized operations (pack_size=4) with vecN -# - InfiniCore uses elementwise operations, which can cause 1 ULP differences -# - This is acceptable as it's within floating-point precision limits -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def div(c, a, b): - # Only support F16 and F32 (matching old operators library) - torch.div(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - a = TestTensor(shape, a_stride, dtype, device) - # For division, ensure b doesn't contain zeros to avoid division by zero - # Similar to old test: b = torch.rand(...) 
* 2, which gives range [0, 2) - # Use scale=2 to ensure values are in [0, 2) range, then add small bias to avoid zero - b = TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateDivDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetDivWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, device) - - def lib_div(): - check_error( - LIBINFINIOP.infiniopDiv( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_div() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py deleted file mode 100644 index f5f9c4cd9..000000000 --- a/test/infiniop/erf.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test 
case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def erf_op(x): - return torch.erf(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-3, 3) for erf operation - # erf domain is (-∞, +∞), so we use range [-3, 3) - x_torch_tensor = torch.rand(shape) * 6 - 3 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = erf_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateErfDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetErfWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_erf(): - check_error( - LIBINFINIOP.infiniopErf( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_erf() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: erf_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_erf(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/floor.py b/test/infiniop/floor.py deleted file mode 100644 index b981da809..000000000 --- a/test/infiniop/floor.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - 
profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def floor_op(x): - return torch.floor(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-20, -10) for floor operation - # floor domain is (-∞, +∞), so we use range [-20, -10) - x_torch_tensor = torch.rand(shape) * 10 - 20 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Floor on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = floor_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateFloorDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetFloorWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_floor(): - check_error( - LIBINFINIOP.infiniopFloor( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_floor() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: floor_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_floor(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyFloorDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in 
get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/binary_test_base.py b/test/infiniop/libinfiniop/binary_test_base.py new file mode 100644 index 000000000..c9da5b4de --- /dev/null +++ b/test/infiniop/libinfiniop/binary_test_base.py @@ -0,0 +1,273 @@ +""" +Base test template for binary operators. + +This module provides a unified test framework for all binary operators, +eliminating code duplication across individual test scripts. + +Usage: + from libinfiniop.binary_test_base import BinaryTestBase + + class DivTest(BinaryTestBase): + OP_NAME = "Div" + OP_NAME_LOWER = "div" + + @staticmethod + def torch_op(c, a, b): + torch.div(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For division, ensure b doesn't contain zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + if __name__ == "__main__": + DivTest.run() +""" + +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Common test cases for binary operators +_BINARY_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +# Inplace options applied for each test case +_BINARY_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _BINARY_INPLACE to each tuple in _BINARY_TEST_CASES_ +_BINARY_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _BINARY_TEST_CASES_ + for inplace_item in _BINARY_INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_BINARY_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + + +class BinaryTestBase: + """ + Base class for binary operator tests.
+ + Subclasses must define: + - OP_NAME: Uppercase operator name (e.g., "Div", "Pow") + - OP_NAME_LOWER: Lowercase operator name (e.g., "div", "pow") + - torch_op: Static method that performs the PyTorch operation + - generate_input_a: Static method that generates first input tensor + - generate_input_b: Static method that generates second input tensor + - TOLERANCE_MAP: Dictionary mapping dtype to tolerance values + """ + + OP_NAME = None + OP_NAME_LOWER = None + + # Default tolerance map (can be overridden) + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + # Test cases (can be overridden) + TEST_CASES = _BINARY_TEST_CASES + TENSOR_DTYPES = _BINARY_TENSOR_DTYPES + + DEBUG = False + PROFILE = False + NUM_PRERUN = 10 + NUM_ITERATIONS = 1000 + + @staticmethod + def torch_op(c, a, b): + """PyTorch operation - must be implemented by subclass""" + raise NotImplementedError("Subclass must implement torch_op") + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + """ + Generate first input tensor - must be implemented by subclass. + + Args: + shape: Tensor shape tuple + a_stride: Stride tuple or None + dtype: InfiniDtype enum value + device: InfiniDeviceEnum value + + Returns: + TestTensor: Generated first input tensor + """ + raise NotImplementedError("Subclass must implement generate_input_a") + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + """ + Generate second input tensor - must be implemented by subclass. + + Args: + shape: Tensor shape tuple + b_stride: Stride tuple or None + dtype: InfiniDtype enum value + device: InfiniDeviceEnum value + + Returns: + TestTensor: Generated second input tensor + """ + raise NotImplementedError("Subclass must implement generate_input_b") + + @classmethod + def test(cls, handle, device, shape, a_stride=None, b_stride=None, c_stride=None, + inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None): + """Common test function for binary operators""" + a = cls.generate_input_a(shape, a_stride, dtype, device) + b = cls.generate_input_b(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing {cls.OP_NAME} on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + cls.torch_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + create_func = getattr(LIBINFINIOP, f"infiniopCreate{cls.OP_NAME}Descriptor") + check_error( + create_func( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + get_workspace_func = getattr(LIBINFINIOP, f"infiniopGet{cls.OP_NAME}WorkspaceSize") + check_error( + get_workspace_func( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_op(): + op_func = getattr(LIBINFINIOP, f"infiniop{cls.OP_NAME}") + check_error( + 
op_func( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_op() + if sync is not None: + sync() + + atol, rtol = get_tolerance(cls.TOLERANCE_MAP, dtype) + if cls.DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + equal_nan = getattr(cls, 'EQUAL_NAN', False) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=equal_nan) + + # Profiling workflow + if cls.PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cls.torch_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_op(), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + # fmt: on + + destroy_func = getattr(LIBINFINIOP, f"infiniopDestroy{cls.OP_NAME}Descriptor") + check_error(destroy_func(descriptor)) + + @classmethod + def run(cls): + """Run the test""" + args = get_args() + + # Configure testing options + cls.DEBUG = args.debug + cls.PROFILE = args.profile + cls.NUM_PRERUN = args.num_prerun + cls.NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, cls.test, cls.TEST_CASES, cls.TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/unary_test_base.py b/test/infiniop/libinfiniop/unary_test_base.py new file mode 100644 index 000000000..648a97d3e --- /dev/null +++ b/test/infiniop/libinfiniop/unary_test_base.py @@ -0,0 +1,242 @@ +""" +Base test template for unary operators. + +This module provides a unified test framework for all unary operators, +eliminating code duplication across individual test scripts. + +Usage: + from libinfiniop.unary_test_base import UnaryTestBase + + class AbsTest(UnaryTestBase): + OP_NAME = "Abs" + OP_NAME_LOWER = "abs" + + @staticmethod + def torch_op(x): + return torch.abs(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate test tensors with values in range [-1, 1) for abs operation + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + if __name__ == "__main__": + AbsTest.run() +""" + +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) +from libinfiniop.utils import to_torch_dtype +from libinfiniop.devices import torch_device_map + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Common test cases for unary operators +_UNARY_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + +# Inplace options applied for each test case +_UNARY_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_UNARY_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _UNARY_TEST_CASES_ + for inplace_item in _UNARY_INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_UNARY_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + + +class UnaryTestBase: + """ + Base 
class for unary operator tests. + + Subclasses must define: + - OP_NAME: Uppercase operator name (e.g., "Abs", "Log") + - OP_NAME_LOWER: Lowercase operator name (e.g., "abs", "log") + - torch_op: Static method that performs the PyTorch operation + - generate_input: Static method that generates input tensor + - TOLERANCE_MAP: Dictionary mapping dtype to tolerance values + """ + + OP_NAME = None + OP_NAME_LOWER = None + + # Default tolerance map (can be overridden) + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + # Test cases (can be overridden) + TEST_CASES = _UNARY_TEST_CASES + TENSOR_DTYPES = _UNARY_TENSOR_DTYPES + + DEBUG = False + PROFILE = False + NUM_PRERUN = 10 + NUM_ITERATIONS = 1000 + + @staticmethod + def torch_op(x): + """PyTorch operation - must be implemented by subclass""" + raise NotImplementedError("Subclass must implement torch_op") + + @staticmethod + def generate_input(shape, dtype, device): + """ + Generate input tensor - must be implemented by subclass. + + Args: + shape: Tensor shape tuple + dtype: PyTorch dtype (e.g., torch.float16, torch.float32) + device: PyTorch device string (e.g., "cpu", "cuda") + + Returns: + torch.Tensor: Generated input tensor + """ + raise NotImplementedError("Subclass must implement generate_input") + + @classmethod + def test(cls, handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None): + """Common test function for unary operators""" + from libinfiniop.devices import torch_device_map + from libinfiniop.utils import to_torch_dtype + + # Generate input tensor + torch_dtype = to_torch_dtype(dtype) + torch_device = torch_device_map[device] + x_torch_tensor = cls.generate_input(shape, torch_dtype, torch_device) + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing {cls.OP_NAME} on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cls.torch_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + create_func = getattr(LIBINFINIOP, f"infiniopCreate{cls.OP_NAME}Descriptor") + check_error( + create_func( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + get_workspace_func = getattr(LIBINFINIOP, f"infiniopGet{cls.OP_NAME}WorkspaceSize") + check_error( + get_workspace_func( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_op(): + op_func = getattr(LIBINFINIOP, f"infiniop{cls.OP_NAME}") + check_error( + op_func( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_op() + if sync is not None: + sync() + + atol, rtol = get_tolerance(cls.TOLERANCE_MAP, dtype) + equal_nan = getattr(cls, 'EQUAL_NAN', False) + + if cls.DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=equal_nan) + + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=equal_nan) + + # Profiling workflow + if cls.PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: 
cls.torch_op(x.torch_tensor()), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_op(), device, cls.NUM_PRERUN, cls.NUM_ITERATIONS) + # fmt: on + + destroy_func = getattr(LIBINFINIOP, f"infiniopDestroy{cls.OP_NAME}Descriptor") + check_error(destroy_func(descriptor)) + + @classmethod + def run(cls): + """Run the test""" + args = get_args() + + # Configure testing options + cls.DEBUG = args.debug + cls.PROFILE = args.profile + cls.NUM_PRERUN = args.num_prerun + cls.NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, cls.test, cls.TEST_CASES, cls.TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/log.py b/test/infiniop/log.py deleted file mode 100644 index 4f97de374..000000000 --- a/test/infiniop/log.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=1e-7, rtol=1e-3 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def log_op(x): - return torch.log(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [0.1, 1.1) for log operation - # log domain is (0, +∞), so we use range [0.1, 1.1) - x_torch_tensor = torch.rand(shape) + 0.1 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Log on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = log_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateLogDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used 
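For reference, here is a rough sketch of how one of the per-operator tests removed later in this patch (max.py) could be re-expressed on top of the new BinaryTestBase. The MaxTest class name is illustrative and not part of the patch; the tolerances, input generation, and equal_nan behaviour mirror the deleted max.py test, under the assumption that the BinaryTestBase API added above is used as-is.

import torch
from libinfiniop import InfiniDtype, TestTensor
from libinfiniop.binary_test_base import BinaryTestBase


class MaxTest(BinaryTestBase):
    OP_NAME = "Max"
    OP_NAME_LOWER = "max"

    # Tolerances copied from the removed max.py test
    TOLERANCE_MAP = {
        InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
        InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    }

    # The removed test compared with equal_nan=True; the base class reads
    # this optional flag via getattr(cls, 'EQUAL_NAN', False)
    EQUAL_NAN = True

    @staticmethod
    def torch_op(c, a, b):
        # PyTorch reference result, written into c in place
        torch.maximum(a, b, out=c)

    @staticmethod
    def generate_input_a(shape, a_stride, dtype, device):
        # max has no domain restrictions, so a plain strided random tensor suffices
        return TestTensor(shape, a_stride, dtype, device)

    @staticmethod
    def generate_input_b(shape, b_stride, dtype, device):
        return TestTensor(shape, b_stride, dtype, device)


if __name__ == "__main__":
    MaxTest.run()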
by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetLogWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_log(): - check_error( - LIBINFINIOP.infiniopLog( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_log() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: log_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_log(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyLogDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/max.py b/test/infiniop/max.py deleted file mode 100644 index e4221cf3e..000000000 --- a/test/infiniop/max.py +++ /dev/null @@ -1,189 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: F32 tolerance is relaxed compared to theoretical precision due to: -# - Old operators library uses vectorized operations (pack_size=4) with vecN -# - InfiniCore uses elementwise operations, which can cause 1 
ULP differences -# - This is acceptable as it's within floating-point precision limits -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def max(c, a, b): - # Only support F16 and F32 (matching old operators library) - torch.maximum(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - a = TestTensor(shape, a_stride, dtype, device) - b = TestTensor(shape, b_stride, dtype, device) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Max on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateMaxDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetMaxWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, device) - - def lib_max(): - check_error( - LIBINFINIOP.infiniopMax( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_max() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_max(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyMaxDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/min.py b/test/infiniop/min.py deleted file mode 100644 index 19f52a334..000000000 --- a/test/infiniop/min.py +++ /dev/null @@ -1,189 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# 
============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: F32 tolerance is relaxed compared to theoretical precision due to: -# - Old operators library uses vectorized operations (pack_size=4) with vecN -# - InfiniCore uses elementwise operations, which can cause 1 ULP differences -# - This is acceptable as it's within floating-point precision limits -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def min(c, a, b): - # Only support F16 and F32 (matching old operators library) - torch.minimum(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - a = TestTensor(shape, a_stride, dtype, device) - b = TestTensor(shape, b_stride, dtype, device) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Min on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateMinDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetMinWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = 
TestWorkspace(workspace_size.value, device) - - def lib_min(): - check_error( - LIBINFINIOP.infiniopMin( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_min() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_min(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyMinDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/mod.py b/test/infiniop/mod.py deleted file mode 100644 index 298f3137f..000000000 --- a/test/infiniop/mod.py +++ /dev/null @@ -1,190 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), - ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: mod operation uses fmod for floating point, which should be exact -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def mod_op(c, a, b): - torch.fmod(a, b, out=c) - - -def test( - handle, - device, - 
shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - # Generate test tensors with values in a reasonable range for mod operation - # Use scale=10 to get values in [0, 10) range, similar to old test - a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0) - # Ensure b doesn't contain zeros to avoid division by zero in mod - b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=0.1) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Mod on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateModDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetModWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, device) - - def lib_mod(): - check_error( - LIBINFINIOP.infiniopMod( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_mod() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_mod(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyModDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/neg.py b/test/infiniop/neg.py deleted file mode 100644 index 62607bce0..000000000 --- a/test/infiniop/neg.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# 
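Similarly, a unary test such as the neg.py file removed in this patch could be rebuilt on UnaryTestBase. This is a minimal sketch under the assumption that the UnaryTestBase API added above is used as-is; the NegTest name is illustrative, and the tolerances and input range are taken from the deleted neg.py.

import torch
from libinfiniop import InfiniDtype
from libinfiniop.unary_test_base import UnaryTestBase


class NegTest(UnaryTestBase):
    OP_NAME = "Neg"
    OP_NAME_LOWER = "neg"

    # Tolerances copied from the removed neg.py test
    TOLERANCE_MAP = {
        InfiniDtype.F16: {"atol": 0, "rtol": 1e-3},
        InfiniDtype.F32: {"atol": 0, "rtol": 1e-7},
    }

    @staticmethod
    def torch_op(x):
        return torch.neg(x).to(x.dtype)

    @staticmethod
    def generate_input(shape, dtype, device):
        # Same input range as the removed test: values in [-200, -100)
        return torch.rand(shape, dtype=dtype, device=device) * 100 - 200


if __name__ == "__main__":
    NegTest.run()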
============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def neg_op(x): - return torch.neg(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for neg operation - # This matches the original test case: * 100 - 200 - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Neg on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = neg_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateNegDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetNegWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_neg(): - check_error( - LIBINFINIOP.infiniopNeg( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_neg() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: neg_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_neg(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyNegDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pow.py b/test/infiniop/pow.py deleted file mode 
100644 index f437c4229..000000000 --- a/test/infiniop/pow.py +++ /dev/null @@ -1,190 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # shape, a_stride, b_stride, c_stride - ((13, 4), None, None, None), - ((13, 4), (10, 1), (10, 1), (10, 1)), - ((13, 4), (0, 1), None, None), - ((13, 4, 4), None, None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), - ((16, 5632), None, None, None), - ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), - ((4, 4, 5632), None, None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_A = auto() - INPLACE_B = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_A, - Inplace.INPLACE_B, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing -# Note: Only F16 and F32 are supported, matching the old repository's binary operator -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Note: pow operation may have larger numerical errors, especially for F16 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def pow_op(c, a, b): - torch.pow(a, b, out=c) - - -def test( - handle, - device, - shape, - a_stride=None, - b_stride=None, - c_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=InfiniDtype.F16, - sync=None, -): - # Generate test tensors with values in a reasonable range for pow operation - # Avoid negative bases and very large exponents to prevent numerical issues - a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) - b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) - - if inplace == Inplace.INPLACE_A: - if c_stride is not None and c_stride != a_stride: - return - c = a - elif inplace == Inplace.INPLACE_B: - if c_stride is not None and c_stride != b_stride: - return - c = b - else: - c = TestTensor(shape, c_stride, dtype, device) - - if c.is_broadcast(): - return - - print( - f"Testing Pow on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreatePowDescriptor( - handle, - ctypes.byref(descriptor), - c.descriptor, - a.descriptor, - b.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent 
them from being directly used by the kernel - for tensor in [a, b, c]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetPowWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, c.device) - - def lib_pow(): - check_error( - LIBINFINIOP.infiniopPow( - descriptor, - workspace.data(), - workspace_size.value, - c.data(), - a.data(), - b.data(), - None, - ) - ) - - lib_pow() - - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - # Use equal_nan=True to handle NaN cases in pow operation - assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_pow(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyPowDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py deleted file mode 100644 index 4e816481c..000000000 --- a/test/infiniop/reciprocal.py +++ /dev/null @@ -1,168 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def reciprocal_op(x): - return torch.reciprocal(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-10, 10) for reciprocal operation - # This matches the 
original test case: * 20 - 10 - # Note: Avoid values too close to zero to prevent division by zero issues - x_torch_tensor = torch.rand(shape) * 20 - 10 - # Ensure no zero values - x_torch_tensor = torch.where(x_torch_tensor == 0, torch.ones_like(x_torch_tensor), x_torch_tensor) - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = reciprocal_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateReciprocalDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_reciprocal(): - check_error( - LIBINFINIOP.infiniopReciprocal( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_reciprocal() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: reciprocal_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/round.py b/test/infiniop/round.py deleted file mode 100644 index d6053f676..000000000 --- a/test/infiniop/round.py +++ /dev/null @@ -1,165 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - 
Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def round_op(x): - return torch.round(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-20, -10) for round operation - # This matches the original test case: * 10 - 20 - x_torch_tensor = torch.rand(shape) * 10 - 20 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Round on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = round_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateRoundDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetRoundWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_round(): - check_error( - LIBINFINIOP.infiniopRound( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_round() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: round_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_round(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyRoundDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sign.py b/test/infiniop/sign.py deleted file mode 100644 index f0eb5b5f8..000000000 --- a/test/infiniop/sign.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, 
-) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=0, rtol=0 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 0}, - InfiniDtype.F32: {"atol": 0, "rtol": 0}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def sign_op(x): - return torch.sign(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for sign operation - # sign domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Sign on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = sign_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateSignDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetSignWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_sign(): - check_error( - LIBINFINIOP.infiniopSign( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_sign() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: sign_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_sign(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroySignDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in 
get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sinh.py b/test/infiniop/sinh.py deleted file mode 100644 index 99bc02c58..000000000 --- a/test/infiniop/sinh.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=0, rtol=0 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 0}, - InfiniDtype.F32: {"atol": 0, "rtol": 0}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def sinh_op(x): - return torch.sinh(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-200, -100) for sinh operation - # sinh domain is (-∞, +∞), so we use range [-200, -100) - x_torch_tensor = torch.rand(shape) * 100 - 200 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Sinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = sinh_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateSinhDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetSinhWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_sinh(): - check_error( - LIBINFINIOP.infiniopSinh( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_sinh() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, 
atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: sinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_sinh(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroySinhDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sqrt.py b/test/infiniop/sqrt.py deleted file mode 100644 index 6e1419971..000000000 --- a/test/infiniop/sqrt.py +++ /dev/null @@ -1,166 +0,0 @@ -import ctypes -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=0, rtol=1e-3 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def sqrt_op(x): - return torch.sqrt(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [0, 100) for sqrt operation - # sqrt domain is [0, +∞), so we use range [0, 100) - x_torch_tensor = torch.rand(shape) * 100 - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Sqrt on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = sqrt_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateSqrtDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the 
shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetSqrtWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_sqrt(): - check_error( - LIBINFINIOP.infiniopSqrt( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_sqrt() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: sqrt_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_sqrt(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroySqrtDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tan.py b/test/infiniop/tan.py deleted file mode 100644 index 877f5dd58..000000000 --- a/test/infiniop/tan.py +++ /dev/null @@ -1,167 +0,0 @@ -import ctypes -import math -from ctypes import c_uint64 -from enum import Enum, auto - -import torch -from libinfiniop import ( - LIBINFINIOP, - InfiniDeviceNames, - InfiniDtype, - InfiniDtypeNames, - TestTensor, - TestWorkspace, - check_error, - debug, - get_args, - get_test_devices, - get_tolerance, - infiniopOperatorDescriptor_t, - profile_operation, - test_operator, -) - -# ============================================================================== -# Configuration (Internal Use Only) -# ============================================================================== -# These are not meant to be imported from other modules -_TEST_CASES_ = [ - # tensor_shape, inplace - ((1, 3),), - ((3, 3),), - ((32, 20, 512),), - ((33, 333, 333),), - ((32, 256, 112, 112),), - ((3, 3, 13, 9, 17),), -] - - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_X = auto() - - -# Inplace options applied for each test case in _TEST_CASES_ -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_X, -] - -# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ -_TEST_CASES = [ - test_case + (inplace_item,) - for test_case in _TEST_CASES_ - for inplace_item in _INPLACE -] - -# Data types used for testing (matching old operators library: only F16 and F32) -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] - -# Tolerance map for different data types -# Copied from old operators library: atol=1e-6, rtol=1e-2 -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-6, "rtol": 1e-2}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def tan_op(x): - return torch.tan(x).to(x.dtype) - - -def test( - handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None -): - # Generate test tensors with values in range [-2π, 2π) for tan operation - # tan domain is (-∞, +∞), so we use range [-2π, 2π) - x_torch_tensor = torch.rand(shape) * 4 * 
math.pi - 2 * math.pi - - x = TestTensor( - shape, - x_torch_tensor.stride(), - dtype, - device, - mode="manual", - set_tensor=x_torch_tensor, - ) - - if inplace == Inplace.INPLACE_X: - y = x - else: - y = TestTensor(shape, None, dtype, device) - - if y.is_broadcast(): - return - - print( - f"Testing Tan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" - ) - - ans = tan_op(x.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateTanDescriptor( - handle, ctypes.byref(descriptor), y.descriptor, x.descriptor - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [x, y]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetTanWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, y.device) - - def lib_tan(): - check_error( - LIBINFINIOP.infiniopTan( - descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None - ) - ) - - lib_tan() - if sync is not None: - sync() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) - assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: tan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_tan(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - - check_error(LIBINFINIOP.infiniopDestroyTanDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/test_all_binary_ops.py b/test/infiniop/test_all_binary_ops.py new file mode 100644 index 000000000..e08b3e41b --- /dev/null +++ b/test/infiniop/test_all_binary_ops.py @@ -0,0 +1,251 @@ +""" +统一测试所有 Binary 算子 + +这个文件包含所有 binary 算子的测试,方便统一管理和运行。 +可以通过命令行参数选择运行哪些算子,或者运行所有算子。 + +使用方法: + # 运行所有 binary 算子测试 + python test_all_binary_ops.py + + # 只运行 div 和 pow 算子 + python test_all_binary_ops.py --ops div pow + + # 运行特定设备上的测试 + python test_all_binary_ops.py --cpu --nvidia +""" + +import torch +import argparse +from libinfiniop import InfiniDtype, TestTensor +from libinfiniop.binary_test_base import BinaryTestBase + + +# ============================================================================== +# 所有 Binary 算子的测试类定义 +# ============================================================================== + +class DivTest(BinaryTestBase): + OP_NAME = "Div" + OP_NAME_LOWER = "div" + + @staticmethod + def torch_op(c, a, b): + torch.div(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For division, ensure b doesn't contain zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class PowTest(BinaryTestBase): 
+ OP_NAME = "Pow" + OP_NAME_LOWER = "pow" + + @staticmethod + def torch_op(c, a, b): + torch.pow(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Avoid negative bases and very large exponents + return TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + +class ModTest(BinaryTestBase): + OP_NAME = "Mod" + OP_NAME_LOWER = "mod" + + @staticmethod + def torch_op(c, a, b): + torch.remainder(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Avoid zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class MaxTest(BinaryTestBase): + OP_NAME = "Max" + OP_NAME_LOWER = "max" + + @staticmethod + def torch_op(c, a, b): + torch.maximum(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class MinTest(BinaryTestBase): + OP_NAME = "Min" + OP_NAME_LOWER = "min" + + @staticmethod + def torch_op(c, a, b): + torch.minimum(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +# ============================================================================== +# 算子注册表 +# ============================================================================== + +# 所有 binary 算子的测试类映射 +BINARY_OP_TESTS = { + "div": DivTest, + "pow": PowTest, + "mod": ModTest, + "max": MaxTest, + "min": MinTest, +} + + +# ============================================================================== +# 主函数 +# ============================================================================== + +def main(): + # 先获取基础参数解析器 + from libinfiniop.utils import get_args as get_base_args + import sys + + # 创建新的参数解析器,添加 --ops 参数 + parser = argparse.ArgumentParser(description="Test all binary operators", parents=[]) + parser.add_argument( + "--ops", + nargs="+", + choices=list(BINARY_OP_TESTS.keys()), + default=list(BINARY_OP_TESTS.keys()), + help="Specify which operators to test (default: all)", + ) + + # 解析参数 + args, unknown = parser.parse_known_args() + + # 将未知参数传递给基础参数解析器 + if unknown: + sys.argv = [sys.argv[0]] + unknown + base_args = get_base_args() + else: + # 如果没有其他参数,使用默认值 + sys.argv = [sys.argv[0]] + base_args = get_base_args() + + # 合并参数 + for attr in dir(base_args): + if not attr.startswith("_") and not hasattr(args, attr): + 
setattr(args, attr, getattr(base_args, attr)) + + # 运行选定的算子测试 + print(f"\n{'='*60}") + print(f"Testing {len(args.ops)} binary operator(s): {', '.join(args.ops)}") + print(f"{'='*60}\n") + + failed_ops = [] + passed_ops = [] + + for op_name in args.ops: + test_class = BINARY_OP_TESTS[op_name] + print(f"\n{'='*60}") + print(f"Testing {test_class.OP_NAME} operator") + print(f"{'='*60}") + + try: + # 创建临时参数对象,传递给测试类 + test_class.DEBUG = args.debug + test_class.PROFILE = args.profile + test_class.NUM_PRERUN = args.num_prerun + test_class.NUM_ITERATIONS = args.num_iterations + + # 运行测试 + for device in get_test_devices(args): + test_operator(device, test_class.test, test_class.TEST_CASES, test_class.TENSOR_DTYPES) + + print(f"\033[92m{test_class.OP_NAME} test passed!\033[0m") + passed_ops.append(op_name) + except Exception as e: + print(f"\033[91m{test_class.OP_NAME} test failed: {e}\033[0m") + failed_ops.append(op_name) + if args.debug: + import traceback + traceback.print_exc() + + # 打印总结 + print(f"\n{'='*60}") + print("Test Summary") + print(f"{'='*60}") + print(f"Total operators: {len(args.ops)}") + print(f"\033[92mPassed: {len(passed_ops)} - {', '.join(passed_ops)}\033[0m") + if failed_ops: + print(f"\033[91mFailed: {len(failed_ops)} - {', '.join(failed_ops)}\033[0m") + print(f"{'='*60}\n") + + if failed_ops: + exit(1) + + +if __name__ == "__main__": + from libinfiniop.utils import get_test_devices, test_operator + main() diff --git a/test/infiniop/test_all_unary_ops.py b/test/infiniop/test_all_unary_ops.py new file mode 100644 index 000000000..b9d7cdc8b --- /dev/null +++ b/test/infiniop/test_all_unary_ops.py @@ -0,0 +1,548 @@ +""" +统一测试所有 Unary 算子 + +这个文件包含所有 unary 算子的测试,方便统一管理和运行。 +可以通过命令行参数选择运行哪些算子,或者运行所有算子。 + +使用方法: + # 运行所有 unary 算子测试 + python test_all_unary_ops.py + + # 只运行 abs 和 log 算子 + python test_all_unary_ops.py --ops abs log + + # 运行特定设备上的测试 + python test_all_unary_ops.py --cpu --nvidia +""" + +import torch +import argparse +from libinfiniop import InfiniDtype +from libinfiniop.unary_test_base import UnaryTestBase + + +# ============================================================================== +# 所有 Unary 算子的测试类定义 +# ============================================================================== + +class AbsTest(UnaryTestBase): + OP_NAME = "Abs" + OP_NAME_LOWER = "abs" + + @staticmethod + def torch_op(x): + return torch.abs(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + +class AcosTest(UnaryTestBase): + OP_NAME = "Acos" + OP_NAME_LOWER = "acos" + + @staticmethod + def torch_op(x): + return torch.acos(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # acos domain is [-1, 1] + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AcoshTest(UnaryTestBase): + OP_NAME = "Acosh" + OP_NAME_LOWER = "acosh" + + @staticmethod + def torch_op(x): + return torch.acosh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # acosh domain is [1, +∞) + return torch.rand(shape, dtype=dtype, device=device) * 10 + 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + 
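+# ------------------------------------------------------------------------------
+# Note: UnaryTestBase (libinfiniop/unary_test_base.py) and the BinaryTestBase
+# used by test_all_binary_ops.py above are not part of this diff. Judging from
+# how the subclasses in these two files use them, each base class is assumed to
+# provide roughly the following hooks (a sketch of the expected interface, not
+# the actual implementation):
+#   OP_NAME / OP_NAME_LOWER    - select the infiniopCreate<Op>Descriptor,
+#                                infiniopGet<Op>WorkspaceSize, infiniop<Op> and
+#                                infiniopDestroy<Op>Descriptor bindings to call;
+#   torch_op(...)              - computes the PyTorch reference result;
+#   generate_input(...)        - builds operands restricted to the op's domain;
+#   TOLERANCE_MAP / EQUAL_NAN  - per-dtype atol/rtol handed to torch.allclose,
+#                                which checks |actual - ref| <= atol + rtol*|ref|;
+#   TEST_CASES / TENSOR_DTYPES, the DEBUG/PROFILE/NUM_PRERUN/NUM_ITERATIONS
+#   toggles set by main(), and a test(...) driver that mirrors the per-operator
+#   scripts (round.py, sign.py, sinh.py, sqrt.py, tan.py, ...) removed earlier
+#   in this patch.
+# ------------------------------------------------------------------------------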
+class AsinTest(UnaryTestBase): + OP_NAME = "Asin" + OP_NAME_LOWER = "asin" + + @staticmethod + def torch_op(x): + return torch.asin(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # asin domain is [-1, 1] + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AsinhTest(UnaryTestBase): + OP_NAME = "Asinh" + OP_NAME_LOWER = "asinh" + + @staticmethod + def torch_op(x): + return torch.asinh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AtanTest(UnaryTestBase): + OP_NAME = "Atan" + OP_NAME_LOWER = "atan" + + @staticmethod + def torch_op(x): + return torch.atan(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class AtanhTest(UnaryTestBase): + OP_NAME = "Atanh" + OP_NAME_LOWER = "atanh" + + @staticmethod + def torch_op(x): + return torch.atanh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # atanh domain is (-1, 1) + return torch.rand(shape, dtype=dtype, device=device) * 1.8 - 0.9 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class CeilTest(UnaryTestBase): + OP_NAME = "Ceil" + OP_NAME_LOWER = "ceil" + + @staticmethod + def torch_op(x): + return torch.ceil(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + +class CosTest(UnaryTestBase): + OP_NAME = "Cos" + OP_NAME_LOWER = "cos" + + @staticmethod + def torch_op(x): + return torch.cos(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate test tensors with values in range [-200, -100) for cos operation + # cos domain is (-∞, +∞), so we use range [-200, -100) + return torch.rand(shape, dtype=dtype, device=device) * 100 - 200 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, + } + + EQUAL_NAN = True + + +class CoshTest(UnaryTestBase): + OP_NAME = "Cosh" + OP_NAME_LOWER = "cosh" + + @staticmethod + def torch_op(x): + return torch.cosh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class ErfTest(UnaryTestBase): + OP_NAME = "Erf" + OP_NAME_LOWER = "erf" + + @staticmethod + def torch_op(x): + return torch.erf(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + 
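+# ------------------------------------------------------------------------------
+# Usage illustration: adding one more unary operator to this harness only takes
+# a small subclass plus an entry in UNARY_OP_TESTS below. The sketch uses exp as
+# the example; its C bindings (infiniopCreateExpDescriptor and friends) are only
+# introduced by a later patch in this series, so the class is shown here purely
+# to document the pattern and is deliberately not registered.
+class ExpTest(UnaryTestBase):
+    OP_NAME = "Exp"
+    OP_NAME_LOWER = "exp"
+
+    @staticmethod
+    def torch_op(x):
+        return torch.exp(x).to(x.dtype)
+
+    @staticmethod
+    def generate_input(shape, dtype, device):
+        # Keep inputs in [-1, 1) so exp() stays well inside the F16 range.
+        return torch.rand(shape, dtype=dtype, device=device) * 2 - 1
+
+    TOLERANCE_MAP = {
+        InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+        InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    }
+
+    EQUAL_NAN = True
+# ------------------------------------------------------------------------------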
+class FloorTest(UnaryTestBase): + OP_NAME = "Floor" + OP_NAME_LOWER = "floor" + + @staticmethod + def torch_op(x): + return torch.floor(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class LogTest(UnaryTestBase): + OP_NAME = "Log" + OP_NAME_LOWER = "log" + + @staticmethod + def torch_op(x): + return torch.log(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log domain is (0, +∞), so we use range [0.1, 1.1) + return torch.rand(shape, dtype=dtype, device=device) + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + +class NegTest(UnaryTestBase): + OP_NAME = "Neg" + OP_NAME_LOWER = "neg" + + @staticmethod + def torch_op(x): + return torch.neg(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class ReciprocalTest(UnaryTestBase): + OP_NAME = "Reciprocal" + OP_NAME_LOWER = "reciprocal" + + @staticmethod + def torch_op(x): + return torch.reciprocal(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Avoid zeros + return torch.rand(shape, dtype=dtype, device=device) * 2 + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class RoundTest(UnaryTestBase): + OP_NAME = "Round" + OP_NAME_LOWER = "round" + + @staticmethod + def torch_op(x): + return torch.round(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class SignTest(UnaryTestBase): + OP_NAME = "Sign" + OP_NAME_LOWER = "sign" + + @staticmethod + def torch_op(x): + return torch.sign(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class SinhTest(UnaryTestBase): + OP_NAME = "Sinh" + OP_NAME_LOWER = "sinh" + + @staticmethod + def torch_op(x): + return torch.sinh(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class SqrtTest(UnaryTestBase): + OP_NAME = "Sqrt" + OP_NAME_LOWER = "sqrt" + + @staticmethod + def torch_op(x): + return torch.sqrt(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # sqrt domain is [0, +∞) + return torch.rand(shape, dtype=dtype, device=device) * 100 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + +class TanTest(UnaryTestBase): + OP_NAME = "Tan" + 
OP_NAME_LOWER = "tan" + + @staticmethod + def torch_op(x): + return torch.tan(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +# ============================================================================== +# 算子注册表 +# ============================================================================== + +# 所有 unary 算子的测试类映射 +UNARY_OP_TESTS = { + "abs": AbsTest, + "acos": AcosTest, + "acosh": AcoshTest, + "asin": AsinTest, + "asinh": AsinhTest, + "atan": AtanTest, + "atanh": AtanhTest, + "ceil": CeilTest, + "cos": CosTest, + "cosh": CoshTest, + "erf": ErfTest, + "floor": FloorTest, + "log": LogTest, + "neg": NegTest, + "reciprocal": ReciprocalTest, + "round": RoundTest, + "sign": SignTest, + "sinh": SinhTest, + "sqrt": SqrtTest, + "tan": TanTest, +} + + +# ============================================================================== +# 主函数 +# ============================================================================== + +def main(): + # 先获取基础参数解析器 + from libinfiniop.utils import get_args as get_base_args + import sys + + # 创建新的参数解析器,添加 --ops 参数 + parser = argparse.ArgumentParser(description="Test all unary operators", parents=[]) + parser.add_argument( + "--ops", + nargs="+", + choices=list(UNARY_OP_TESTS.keys()), + default=list(UNARY_OP_TESTS.keys()), + help="Specify which operators to test (default: all)", + ) + + # 解析参数 + args, unknown = parser.parse_known_args() + + # 将未知参数传递给基础参数解析器 + if unknown: + sys.argv = [sys.argv[0]] + unknown + base_args = get_base_args() + else: + # 如果没有其他参数,使用默认值 + sys.argv = [sys.argv[0]] + base_args = get_base_args() + + # 合并参数 + for attr in dir(base_args): + if not attr.startswith("_") and not hasattr(args, attr): + setattr(args, attr, getattr(base_args, attr)) + + # 运行选定的算子测试 + print(f"\n{'='*60}") + print(f"Testing {len(args.ops)} unary operator(s): {', '.join(args.ops)}") + print(f"{'='*60}\n") + + failed_ops = [] + passed_ops = [] + + for op_name in args.ops: + test_class = UNARY_OP_TESTS[op_name] + print(f"\n{'='*60}") + print(f"Testing {test_class.OP_NAME} operator") + print(f"{'='*60}") + + try: + # 创建临时参数对象,传递给测试类 + test_class.DEBUG = args.debug + test_class.PROFILE = args.profile + test_class.NUM_PRERUN = args.num_prerun + test_class.NUM_ITERATIONS = args.num_iterations + + # 运行测试 + for device in get_test_devices(args): + test_operator(device, test_class.test, test_class.TEST_CASES, test_class.TENSOR_DTYPES) + + print(f"\033[92m{test_class.OP_NAME} test passed!\033[0m") + passed_ops.append(op_name) + except Exception as e: + print(f"\033[91m{test_class.OP_NAME} test failed: {e}\033[0m") + failed_ops.append(op_name) + if args.debug: + import traceback + traceback.print_exc() + + # 打印总结 + print(f"\n{'='*60}") + print("Test Summary") + print(f"{'='*60}") + print(f"Total operators: {len(args.ops)}") + print(f"\033[92mPassed: {len(passed_ops)} - {', '.join(passed_ops)}\033[0m") + if failed_ops: + print(f"\033[91mFailed: {len(failed_ops)} - {', '.join(failed_ops)}\033[0m") + print(f"{'='*60}\n") + + if failed_ops: + exit(1) + + +if __name__ == "__main__": + from libinfiniop.utils import get_test_devices, test_operator + main() From dcea337976d62fe93ba415040dddc576f7d4d1b4 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Thu, 8 Jan 2026 06:21:12 +0000 Subject: [PATCH 5/7] Issue/888 - Add 
averagepool,batch_norm,cross_entropy_loss,exp,hardswish,gather,index_copy_inplace,interpolate_nearest,maxpool,scatter operators from competition. --- include/infiniop/ops/averagepool.h | 29 ++ include/infiniop/ops/batch_norm.h | 37 ++ include/infiniop/ops/cross_entropy_loss.h | 27 ++ include/infiniop/ops/exp.h | 24 ++ include/infiniop/ops/gather.h | 31 ++ include/infiniop/ops/hardswish.h | 24 ++ include/infiniop/ops/index_copy_inplace.h | 30 ++ include/infiniop/ops/interpolate_nearest.h | 25 ++ include/infiniop/ops/maxpool.h | 29 ++ include/infiniop/ops/scatter.h | 30 ++ src/infiniop/ops/averagepool/averagepool.h | 52 +++ .../ops/averagepool/cpu/averagepool_cpu.cc | 348 ++++++++++++++++++ .../ops/averagepool/cpu/averagepool_cpu.h | 8 + .../averagepool/cuda/averagepool_kernel.cuh | 185 ++++++++++ src/infiniop/ops/averagepool/info.h | 136 +++++++ .../ops/averagepool/nvidia/averagepool.cu | 220 +++++++++++ .../averagepool/nvidia/averagepool_nvidia.cuh | 8 + src/infiniop/ops/averagepool/operator.cc | 147 ++++++++ src/infiniop/ops/batch_norm/batch_norm.h | 54 +++ .../ops/batch_norm/cpu/batch_norm_cpu.cc | 118 ++++++ .../ops/batch_norm/cpu/batch_norm_cpu.h | 8 + src/infiniop/ops/batch_norm/cuda/kernel.cuh | 57 +++ src/infiniop/ops/batch_norm/info.h | 69 ++++ .../batch_norm/nvidia/batch_norm_nvidia.cu | 176 +++++++++ .../batch_norm/nvidia/batch_norm_nvidia.cuh | 10 + src/infiniop/ops/batch_norm/operator.cc | 156 ++++++++ .../cpu/cross_entropy_loss_cpu.cc | 321 ++++++++++++++++ .../cpu/cross_entropy_loss_cpu.h | 8 + .../cross_entropy_loss/cross_entropy_loss.h | 48 +++ src/infiniop/ops/cross_entropy_loss/info.h | 36 ++ .../nvidia/cross_entropy_loss_nvidia.cu | 217 +++++++++++ .../nvidia/cross_entropy_loss_nvidia.cuh | 8 + .../ops/cross_entropy_loss/operator.cc | 142 +++++++ src/infiniop/ops/exp/cpu/exp_cpu.cc | 52 +++ src/infiniop/ops/exp/cpu/exp_cpu.h | 21 ++ src/infiniop/ops/exp/cuda/kernel.cuh | 39 ++ src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 59 +++ src/infiniop/ops/exp/nvidia/exp_nvidia.cuh | 8 + src/infiniop/ops/exp/operator.cc | 139 +++++++ src/infiniop/ops/gather/cpu/gather_cpu.cc | 96 +++++ src/infiniop/ops/gather/cpu/gather_cpu.h | 8 + src/infiniop/ops/gather/cuda/kernel.cuh | 37 ++ src/infiniop/ops/gather/gather.h | 47 +++ src/infiniop/ops/gather/info.h | 58 +++ .../ops/gather/nvidia/gather_nvidia.cu | 179 +++++++++ .../ops/gather/nvidia/gather_nvidia.cuh | 7 + src/infiniop/ops/gather/operator.cc | 144 ++++++++ .../ops/hardswish/cpu/hardswish_cpu.cc | 52 +++ .../ops/hardswish/cpu/hardswish_cpu.h | 30 ++ src/infiniop/ops/hardswish/cuda/kernel.cuh | 55 +++ .../ops/hardswish/nvidia/hardswish_nvidia.cu | 59 +++ .../ops/hardswish/nvidia/hardswish_nvidia.cuh | 8 + src/infiniop/ops/hardswish/operator.cc | 139 +++++++ .../cpu/index_copy_inplace_cpu.cc | 93 +++++ .../cpu/index_copy_inplace_cpu.h | 8 + .../index_copy_inplace/index_copy_inplace.h | 53 +++ src/infiniop/ops/index_copy_inplace/info.h | 74 ++++ .../nvidia/index_copy_inplace_nvidia.cu | 127 +++++++ .../nvidia/index_copy_inplace_nvidia.cuh | 7 + .../ops/index_copy_inplace/operator.cc | 144 ++++++++ .../cpu/interpolate_nearest_cpu.cc | 284 ++++++++++++++ .../cpu/interpolate_nearest_cpu.h | 8 + .../ops/interpolate_nearest/cuda/kernel.cuh | 168 +++++++++ src/infiniop/ops/interpolate_nearest/info.h | 118 ++++++ .../interpolate_nearest/interpolate_nearest.h | 51 +++ .../nvidia/interpolate_nearest_nvidia.cu | 93 +++++ .../nvidia/interpolate_nearest_nvidia.cuh | 9 + .../ops/interpolate_nearest/operator.cc | 139 +++++++ 
src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc | 322 ++++++++++++++++ src/infiniop/ops/maxpool/cpu/maxpool_cpu.h | 8 + src/infiniop/ops/maxpool/info.h | 113 ++++++ src/infiniop/ops/maxpool/maxpool.h | 53 +++ .../ops/maxpool/nvidia/maxpool_nvidia.cu | 240 ++++++++++++ .../ops/maxpool/nvidia/maxpool_nvidia.cuh | 8 + src/infiniop/ops/maxpool/operator.cc | 147 ++++++++ src/infiniop/ops/scatter/cpu/scatter_cpu.cc | 100 +++++ src/infiniop/ops/scatter/cpu/scatter_cpu.h | 8 + src/infiniop/ops/scatter/cuda/kernel.cuh | 37 ++ src/infiniop/ops/scatter/info.h | 64 ++++ .../ops/scatter/metax/scatter_metax.h | 8 + .../ops/scatter/metax/scatter_metax.maca | 190 ++++++++++ .../ops/scatter/nvidia/scatter_nvidia.cu | 180 +++++++++ .../ops/scatter/nvidia/scatter_nvidia.cuh | 7 + src/infiniop/ops/scatter/operator.cc | 160 ++++++++ src/infiniop/ops/scatter/scatter.h | 47 +++ test/infiniop/averagepool.py | 239 ++++++++++++ test/infiniop/batch_norm.py | 244 ++++++++++++ test/infiniop/cross_entropy_loss.py | 213 +++++++++++ test/infiniop/exp.py | 165 +++++++++ test/infiniop/gather.py | 160 ++++++++ test/infiniop/hardswish.py | 167 +++++++++ test/infiniop/index_copy_inplace.py | 180 +++++++++ test/infiniop/interpolate_nearest.py | 265 +++++++++++++ test/infiniop/libinfiniop/op_register.py | 321 +++++++++++++++- test/infiniop/maxpool.py | 242 ++++++++++++ test/infiniop/scatter.py | 196 ++++++++++ 96 files changed, 9514 insertions(+), 1 deletion(-) create mode 100644 include/infiniop/ops/averagepool.h create mode 100644 include/infiniop/ops/batch_norm.h create mode 100644 include/infiniop/ops/cross_entropy_loss.h create mode 100644 include/infiniop/ops/exp.h create mode 100644 include/infiniop/ops/gather.h create mode 100644 include/infiniop/ops/hardswish.h create mode 100644 include/infiniop/ops/index_copy_inplace.h create mode 100644 include/infiniop/ops/interpolate_nearest.h create mode 100644 include/infiniop/ops/maxpool.h create mode 100644 include/infiniop/ops/scatter.h create mode 100644 src/infiniop/ops/averagepool/averagepool.h create mode 100644 src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc create mode 100644 src/infiniop/ops/averagepool/cpu/averagepool_cpu.h create mode 100644 src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh create mode 100644 src/infiniop/ops/averagepool/info.h create mode 100644 src/infiniop/ops/averagepool/nvidia/averagepool.cu create mode 100644 src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh create mode 100644 src/infiniop/ops/averagepool/operator.cc create mode 100644 src/infiniop/ops/batch_norm/batch_norm.h create mode 100644 src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc create mode 100644 src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h create mode 100644 src/infiniop/ops/batch_norm/cuda/kernel.cuh create mode 100644 src/infiniop/ops/batch_norm/info.h create mode 100644 src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu create mode 100644 src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh create mode 100644 src/infiniop/ops/batch_norm/operator.cc create mode 100644 src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc create mode 100644 src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h create mode 100644 src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h create mode 100644 src/infiniop/ops/cross_entropy_loss/info.h create mode 100644 src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu create mode 100644 src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh create mode 
100644 src/infiniop/ops/cross_entropy_loss/operator.cc create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.cc create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.h create mode 100644 src/infiniop/ops/exp/cuda/kernel.cuh create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cu create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cuh create mode 100644 src/infiniop/ops/exp/operator.cc create mode 100644 src/infiniop/ops/gather/cpu/gather_cpu.cc create mode 100644 src/infiniop/ops/gather/cpu/gather_cpu.h create mode 100644 src/infiniop/ops/gather/cuda/kernel.cuh create mode 100644 src/infiniop/ops/gather/gather.h create mode 100644 src/infiniop/ops/gather/info.h create mode 100644 src/infiniop/ops/gather/nvidia/gather_nvidia.cu create mode 100644 src/infiniop/ops/gather/nvidia/gather_nvidia.cuh create mode 100644 src/infiniop/ops/gather/operator.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.h create mode 100644 src/infiniop/ops/hardswish/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh create mode 100644 src/infiniop/ops/hardswish/operator.cc create mode 100644 src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc create mode 100644 src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h create mode 100644 src/infiniop/ops/index_copy_inplace/index_copy_inplace.h create mode 100644 src/infiniop/ops/index_copy_inplace/info.h create mode 100644 src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu create mode 100644 src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh create mode 100644 src/infiniop/ops/index_copy_inplace/operator.cc create mode 100644 src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc create mode 100644 src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h create mode 100644 src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh create mode 100644 src/infiniop/ops/interpolate_nearest/info.h create mode 100644 src/infiniop/ops/interpolate_nearest/interpolate_nearest.h create mode 100644 src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu create mode 100644 src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh create mode 100644 src/infiniop/ops/interpolate_nearest/operator.cc create mode 100644 src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc create mode 100644 src/infiniop/ops/maxpool/cpu/maxpool_cpu.h create mode 100644 src/infiniop/ops/maxpool/info.h create mode 100644 src/infiniop/ops/maxpool/maxpool.h create mode 100644 src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu create mode 100644 src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh create mode 100644 src/infiniop/ops/maxpool/operator.cc create mode 100644 src/infiniop/ops/scatter/cpu/scatter_cpu.cc create mode 100644 src/infiniop/ops/scatter/cpu/scatter_cpu.h create mode 100644 src/infiniop/ops/scatter/cuda/kernel.cuh create mode 100644 src/infiniop/ops/scatter/info.h create mode 100644 src/infiniop/ops/scatter/metax/scatter_metax.h create mode 100644 src/infiniop/ops/scatter/metax/scatter_metax.maca create mode 100644 src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu create mode 100644 src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh create mode 100644 src/infiniop/ops/scatter/operator.cc create mode 100644 src/infiniop/ops/scatter/scatter.h create mode 100644 
test/infiniop/averagepool.py create mode 100644 test/infiniop/batch_norm.py create mode 100644 test/infiniop/cross_entropy_loss.py create mode 100644 test/infiniop/exp.py create mode 100644 test/infiniop/gather.py create mode 100644 test/infiniop/hardswish.py create mode 100644 test/infiniop/index_copy_inplace.py create mode 100644 test/infiniop/interpolate_nearest.py create mode 100644 test/infiniop/maxpool.py create mode 100644 test/infiniop/scatter.py diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h new file mode 100644 index 000000000..87e857175 --- /dev/null +++ b/include/infiniop/ops/averagepool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_AVERAGEPOOL_H__ +#define __INFINIOP_AVERAGEPOOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); + +#endif // __INFINIOP_AVERAGEPOOL_H__ diff --git a/include/infiniop/ops/batch_norm.h b/include/infiniop/ops/batch_norm.h new file mode 100644 index 000000000..5487a1f69 --- /dev/null +++ b/include/infiniop/ops/batch_norm.h @@ -0,0 +1,37 @@ +#ifndef __INFINIOP_BATCH_NORM_API_H__ +#define __INFINIOP_BATCH_NORM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBatchNormDescriptor_t; + +__C __export infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +); + +__C __export infiniStatus_t infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopBatchNorm(infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h new file mode 100644 index 000000000..8b59843c9 --- /dev/null +++ b/include/infiniop/ops/cross_entropy_loss.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ +#define __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t; + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc); + +__C 
infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream); + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); + +#endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h new file mode 100644 index 000000000..624bc5363 --- /dev/null +++ b/include/infiniop/ops/exp.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_EXP_API_H__ +#define __INFINIOP_EXP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; + +__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gather.h b/include/infiniop/ops/gather.h new file mode 100644 index 000000000..9ffe310c9 --- /dev/null +++ b/include/infiniop/ops/gather.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_GATHER_API_H__ +#define __INFINIOP_GATHER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..8d655fe82 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/index_copy_inplace.h 
b/include/infiniop/ops/index_copy_inplace.h new file mode 100644 index 000000000..e2266299a --- /dev/null +++ b/include/infiniop/ops/index_copy_inplace.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +__C __export infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopIndexCopyInplace(infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h new file mode 100644 index 000000000..7f970dc38 --- /dev/null +++ b/include/infiniop/ops/interpolate_nearest.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_INTERPOLATE_NEAREST_H__ +#define __INFINIOP_INTERPOLATE_NEAREST_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t; + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc); + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); + +#endif // __INFINIOP_INTERPOLATE_NEAREST_H__ diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h new file mode 100644 index 000000000..e47a43aed --- /dev/null +++ b/include/infiniop/ops/maxpool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_MAX_POOL_H__ +#define __INFINIOP_MAX_POOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); + +#endif // __INFINIOP_MAX_POOL_H__ diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h new file mode 100644 index 000000000..22e0eff83 --- /dev/null +++ b/include/infiniop/ops/scatter.h @@ -0,0 +1,30 @@ +#ifndef 
__INFINIOP_SCATTER_API_H__ +#define __INFINIOP_SCATTER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; + +__C __export infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/averagepool/averagepool.h b/src/infiniop/ops/averagepool/averagepool.h new file mode 100644 index 000000000..7762826ab --- /dev/null +++ b/src/infiniop/ops/averagepool/averagepool.h @@ -0,0 +1,52 @@ +#ifndef __AVERAGEPOOL_H__ +#define __AVERAGEPOOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::averagepool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_H__ diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc new file mode 100644 index 000000000..95a347ddc --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc @@ -0,0 +1,348 @@ +#include "averagepool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + workspace_size = 0; + } + + template + void _avgpool_1d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + +#pragma omp parallel for 
schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + const size_t input_offset = (static_cast(b) * channels + static_cast(c)) * input_nc_stride; + const size_t output_offset = (static_cast(b) * channels + static_cast(c)) * output_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + iw]); + valid_count++; + } else if (iw >= -static_cast(pad_width) && iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + ow] = utils::cast(result); + } + } + } + } + + template + void _avgpool_2d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + const size_t input_offset = (static_cast(b) * channels + static_cast(c)) * input_nc_stride; + const size_t output_offset = (static_cast(b) * channels + static_cast(c)) * output_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + ih * input_width + iw]); + valid_count++; + } else if (ih >= -static_cast(pad_height) && ih < static_cast(input_height + pad_height) && iw >= -static_cast(pad_width) && iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + oh * output_width + ow] = utils::cast(result); + } + } + } + } + } + + template + void _avgpool_3d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = 
info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + const size_t input_offset = (static_cast(b) * channels + static_cast(c)) * input_nc_stride; + const size_t output_offset = (static_cast(b) * channels + static_cast(c)) * output_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kd = 0; kd < static_cast(kernel_depth); ++kd) { + const int id = start_d + kd; + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + const int ih = start_h + kh; + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && ih >= 0 && ih < static_cast(input_height) && iw >= 0 && iw < static_cast(input_width)) { + const size_t idx = id * (input_height * input_width) + ih * input_width + iw; + sum += utils::cast(input[input_offset + idx]); + valid_count++; + } else if (id >= -static_cast(pad_depth) && id < static_cast(input_depth + pad_depth) && ih >= -static_cast(pad_height) && ih < static_cast(input_height + pad_height) && iw >= -static_cast(pad_width) && iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + + const size_t out_idx = od * (output_height * output_width) + oh * output_width + ow; + output[output_offset + out_idx] = utils::cast(result); + } + } + } + } + } + } + + template + void _avgpool_cpu(Ydata *output, const T *input) const { + switch (info.ndim) { + case 1: + _avgpool_1d(output, input); + break; + case 2: + _avgpool_2d(output, input); + break; + case 3: + _avgpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + AvgPoolInfo &info) { + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + output_size *= info.output_dims[i]; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + 
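            // Sketch of the dtype dispatch (not in the original comments): the F32 case runs the
            // pooling loops directly into the caller's output buffer, while the F16/BF16 cases
            // below first accumulate into the float workspace and then cast each element back to
            // the narrow type — which is why create() only requests a workspace for half-precision
            // dtypes.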
_avgpool_cpu(typed_output, typed_input); + break; + } + case INFINI_DTYPE_F16: { + float *typed_output_f32 = static_cast(workspace); + const fp16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + fp16_t *typed_output = static_cast(output); +#pragma omp parallel for + for (int i = 0; i < static_cast(output_size); ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + float *typed_output_f32 = static_cast(workspace); + const bf16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + bf16_t *typed_output = static_cast(output); +#pragma omp parallel for + for (int i = 0; i < static_cast(output_size); ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +inline size_t calculateOutputSize(const AvgPoolInfo &info) { + size_t size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + size *= info.output_dims[i]; + } + return size; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + size_t workspace_size = 0; + if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) { + workspace_size = calculateOutputSize(info) * sizeof(float); + } + + *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::averagepool::cpu diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h new file mode 100644 index 000000000..8388f80ff --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CPU_H__ +#define __AVERAGEPOOL_CPU_H__ + +#include "../averagepool.h" + +DESCRIPTOR(cpu) + +#endif // __AVERAGEPOOL_CPU_H__ diff --git a/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh new file mode 100644 index 000000000..7c9d0f438 --- /dev/null +++ b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh @@ -0,0 +1,185 @@ +#ifndef __AVERAGEPOOL_KERNEL_H__ +#define __AVERAGEPOOL_KERNEL_H__ + +#include + +// 1D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool1d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_length, + int output_length, int kernel_size, int stride, int 
padding) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) { + return; + } + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_length + channel_idx * input_length; + T *output_ptr = output + batch_idx * channels * output_length + channel_idx * output_length; + + // 计算池化窗口的起始位置 + int window_start = output_idx * stride - padding; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int k = 0; k < kernel_size; ++k) { + int input_pos = window_start + k; + + if (input_pos >= 0 && input_pos < input_length) { + // 有效的输入位置,转换为单精度进行累加 + sum += static_cast(input_ptr[input_pos]); + valid_count++; + } else if (input_pos >= -padding && input_pos < input_length + padding) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 2D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool2d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_height, + int input_width, int output_height, int output_width, int kernel_h, + int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为2D坐标 + int out_h = output_idx / output_width; + int out_w = output_idx % output_width; + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width; + T *output_ptr = output + batch_idx * channels * output_height * output_width + channel_idx * output_height * output_width; + + // 计算池化窗口的起始位置 + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = input_h * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 3D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool3d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_depth, + int input_height, int input_width, int output_depth, int output_height, + int output_width, int kernel_d, int kernel_h, int kernel_w, int stride_d, + int stride_h, int stride_w, int pad_d, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = 
blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_depth * output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为3D坐标 + int out_d = output_idx / (output_height * output_width); + int remaining = output_idx % (output_height * output_width); + int out_h = remaining / output_width; + int out_w = remaining % output_width; + + // 计算输入和输出的偏移 + int input_spatial_size = input_depth * input_height * input_width; + int output_spatial_size = output_depth * output_height * output_width; + + const T *input_ptr = input + batch_idx * channels * input_spatial_size + channel_idx * input_spatial_size; + T *output_ptr = output + batch_idx * channels * output_spatial_size + channel_idx * output_spatial_size; + + // 计算池化窗口的起始位置 + int window_start_d = out_d * stride_d - pad_d; + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kd = 0; kd < kernel_d; ++kd) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_d = window_start_d + kd; + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = (input_d * input_height + input_h) * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +#endif // __AVERAGEPOOL_KERNEL_H__ diff --git a/src/infiniop/ops/averagepool/info.h b/src/infiniop/ops/averagepool/info.h new file mode 100644 index 000000000..871e827a7 --- /dev/null +++ b/src/infiniop/ops/averagepool/info.h @@ -0,0 +1,136 @@ +#ifndef __AVERAGEPOOL_INFO_H__ +#define __AVERAGEPOOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include +#include + +namespace op::averagepool { + +inline utils::Result calculatePoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + size_t padded_input_size = input_size + 2 * padding; + + if (padded_input_size < kernel_size) { + return utils::Result(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t output_size; + if (ceil_mode) { + // 等效于整数的上取整 + output_size = (padded_input_size - kernel_size + stride - 1) / stride + 1; + } else { + // 等效于整数的下取整 + output_size = (padded_input_size - kernel_size) / stride + 1; + } + + return utils::Result(output_size); +} + +// 检查是否存在隐式填充 +inline bool hasImplicitPadding( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding, + bool ceil_mode) { + + if (!ceil_mode) { + return false; + } + return ((input_size + 2 * padding) - kernel_size) % stride != 0; +} + +class 
AvgPoolInfo { + AvgPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + bool has_implicit_padding = false; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + AvgPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // 空间维度 + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // 初始化隐式填充标志 + info.has_implicit_padding = false; + + // 获取并校验空间维度 + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + + auto output_size_result = calculatePoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size_result); + + size_t expected_size = output_size_result.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + + // 检查当前维度是否存在隐式填充 + if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i], + info.strides[i], info.pads[i], info.ceil_mode)) { + info.has_implicit_padding = true; + } + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::averagepool + +#endif // __AVERAGEPOOL_INFO_H__ diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool.cu b/src/infiniop/ops/averagepool/nvidia/averagepool.cu new file mode 100644 index 000000000..6f276aac8 --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool.cu @@ -0,0 +1,220 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "averagepool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = 
device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, kernel_vec.size(), kernel_vec.data(), + pad_vec.data(), stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Average pooling typically doesn't need a workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + 
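// A minimal host-side usage sketch of the AvgPool C API that operator.cc wires up below.
// It assumes an already-created infiniopHandle_t, valid NCHW input/output tensor
// descriptors, and device buffers; the element type of the kernel_size/strides/pads
// arrays (passed as void*) is an assumption here, chosen to match how AvgPoolInfo::create
// reads them. Error-status checks are omitted for brevity.
//
//   size_t ks[2] = {3, 3}, st[2] = {2, 2}, pd[2] = {1, 1};   // hypothetical 2D pooling config
//   infiniopAvgPoolDescriptor_t pool_desc = nullptr;
//   infiniopCreateAvgPoolDescriptor(handle, &pool_desc, out_desc, in_desc,
//                                   ks, st, pd, /*ceil_mode=*/false);
//   size_t ws_size = 0;
//   infiniopGetAvgPoolWorkspaceSize(pool_desc, &ws_size);
//   infiniopAvgPool(pool_desc, workspace, ws_size, d_output, d_input, stream);
//   infiniopDestroyAvgPoolDescriptor(pool_desc);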
+infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::nvidia diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh new file mode 100644 index 000000000..ef19aa1dc --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CUDA_CUH__ +#define __AVERAGEPOOL_CUDA_CUH__ + +#include "../averagepool.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_CUDA_CUH__ diff --git a/src/infiniop/ops/averagepool/operator.cc b/src/infiniop/ops/averagepool/operator.cc new file mode 100644 index 000000000..233ec4736 --- /dev/null +++ b/src/infiniop/ops/averagepool/operator.cc @@ -0,0 +1,147 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/averagepool_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::averagepool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t 
*size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAvgPool( + infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/batch_norm/batch_norm.h b/src/infiniop/ops/batch_norm/batch_norm.h new file mode 100644 index 000000000..3bee6b5bb --- /dev/null +++ b/src/infiniop/ops/batch_norm/batch_norm.h @@ -0,0 +1,54 @@ +#ifndef __BATCH_NORM_H__ +#define __BATCH_NORM_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::batch_norm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BatchNormInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + BatchNormInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t running_mean_desc, \ + infiniopTensorDescriptor_t running_var_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + float momentum, \ + float eps); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + void *running_mean, \ + void *running_var, \ + const void *input, \ + const void *weight, \ + const void *bias, \ + void 
*stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc new file mode 100644 index 000000000..876b82904 --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc @@ -0,0 +1,118 @@ +#include "batch_norm_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::batch_norm::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + size_t WorkSpaceSize = 0; + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias) { + +#pragma omp parallel for + for (int c = 0; c < static_cast(info.channel_size); c++) { + float sum_sq = 0., sum = 0.; + for (size_t b = 0; b < info.batch_size; b++) { + sum += op::common_cpu::reduce_op::sum( + input + (b * info.channel_size + static_cast(c)) * info.dim_size, + info.dim_size, + 1); + sum_sq += op::common_cpu::reduce_op::sumSquared( + input + (b * info.channel_size + static_cast(c)) * info.dim_size, + info.dim_size, + 1); + } + float batch_and_dim_size = static_cast(info.batch_size * info.dim_size); + float E = sum / batch_and_dim_size; + float var_biased = sum_sq / batch_and_dim_size - E * E; + float var_unbiased = var_biased * batch_and_dim_size / (batch_and_dim_size - 1.0f); + + auto running_mean_ptr = running_mean + static_cast(c) * info.running_mean_stride; + auto running_var_ptr = running_var + static_cast(c) * info.running_var_stride; + *running_mean_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_mean_ptr) + info.momentum * E); + *running_var_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_var_ptr) + info.momentum * var_unbiased); + + for (size_t b = 0; b < info.batch_size; b++) { + for (size_t d = 0; d < info.dim_size; d++) { + auto input_ptr = input + ((b * info.channel_size + static_cast(c)) * info.dim_size) + d; + auto output_ptr = output + ((b * info.channel_size + static_cast(c)) * info.dim_size) + d; + ; + auto weight_ptr = weight + static_cast(c) * info.weight_stride; + auto bias_ptr = bias + static_cast(c) * info.bias_stride; + *output_ptr = utils::cast( + (utils::cast(*input_ptr) - E) / std::sqrt(var_biased + info.eps) * utils::cast(*weight_ptr) + utils::cast(*bias_ptr)); + } + } + } + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BATCH_NORM(TDATA) \ + CHECK_STATUS(calculate_batch_norm(_info, \ + (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA 
*)input, (const TDATA *)weight, (const TDATA *)bias)) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + void *running_mean, + void *running_var, + const void *input, + const void *weight, + const void *bias, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_BATCH_NORM(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_BATCH_NORM(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_BATCH_NORM(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::cpu diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h new file mode 100644 index 000000000..722ebc6ec --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h @@ -0,0 +1,8 @@ +#ifndef __BATCH_NORM_CPU_H__ +#define __BATCH_NORM_CPU_H__ + +#include "../batch_norm.h" + +DESCRIPTOR(cpu) + +#endif // __BATCH_NORM_CPU_H__ diff --git a/src/infiniop/ops/batch_norm/cuda/kernel.cuh b/src/infiniop/ops/batch_norm/cuda/kernel.cuh new file mode 100644 index 000000000..6132b1afe --- /dev/null +++ b/src/infiniop/ops/batch_norm/cuda/kernel.cuh @@ -0,0 +1,57 @@ +#ifndef __BATCH_NORM_KERNEL_CUH__ +#define __BATCH_NORM_KERNEL_CUH__ + +#include "../../../reduce/cuda/reduce.cuh" +#include + +template +__device__ void batchNormKernel( + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + float momentum, + float eps) { + auto output_ptr = output + dim_size * blockIdx.x; + auto input_ptr = input + dim_size * blockIdx.x; + + auto running_mean_ptr = running_mean + running_mean_stride * blockIdx.x; + auto running_var_ptr = running_var + running_var_stride * blockIdx.x; + auto weight_ptr = weight + weight_stride * blockIdx.x; + auto bias_ptr = bias + bias_stride * blockIdx.x; + + Tcompute sum_squared = 0., sum = 0.; + for (size_t b = 0; b < batch_size; b++) { + sum += op::common_cuda::reduce_op::sum( + input_ptr + b * (channel_size * dim_size), dim_size); + sum_squared += op::common_cuda::reduce_op::sumSquared( + input_ptr + b * (channel_size * dim_size), dim_size); + } + + __shared__ Tcompute E, var_biased; + if (threadIdx.x == 0) { + E = sum / Tcompute(batch_size * dim_size); + var_biased = sum_squared / Tcompute(batch_size * dim_size) - E * E; + Tcompute var_unbiased = var_biased * Tcompute(batch_size * dim_size) / Tcompute(batch_size * dim_size - 1); + *running_mean_ptr = Tcompute(1 - momentum) * Tcompute(*running_mean_ptr) + Tcompute(momentum) * E; + *running_var_ptr = Tcompute(1 - momentum) * Tcompute(*running_var_ptr) + Tcompute(momentum) * var_unbiased; + } + __syncthreads(); + + for (size_t n = threadIdx.x; n < batch_size * dim_size; n += BLOCK_SIZE) { + size_t b = n / dim_size, d = n % dim_size; + *(output_ptr + b * channel_size * dim_size + d) = (Tcompute(*(input_ptr + b * channel_size * dim_size + d)) - E) / sqrtf(float(var_biased + Tcompute(eps))) * Tcompute(*weight_ptr) + Tcompute(*bias_ptr); + } +} + +#endif // __BATCH_NORM_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/info.h b/src/infiniop/ops/batch_norm/info.h new file mode 100644 index 000000000..c27479865 --- /dev/null +++ 
b/src/infiniop/ops/batch_norm/info.h @@ -0,0 +1,69 @@ +#ifndef __BATCH_NORM_INFO_H__ +#define __BATCH_NORM_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::batch_norm { + +class BatchNormInfo { +private: + BatchNormInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t batch_size, channel_size, dim_size; + + ptrdiff_t running_mean_stride; + ptrdiff_t running_var_stride; + ptrdiff_t weight_stride; + ptrdiff_t bias_stride; + float momentum; + float eps; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createBatchNormInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == 3, + INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_SAME_SHAPE(output_desc->shape(), input_desc->shape()); + size_t batch_size = output_desc->dim(0), + channel_size = output_desc->dim(1), + dim_size = output_desc->dim(2); + CHECK_SAME_SHAPE( + running_mean_desc->shape(), running_var_desc->shape(), + weight_desc->shape(), bias_desc->shape()); + CHECK_OR_RETURN( + running_mean_desc->ndim() == 1 && running_mean_desc->dim(0) == channel_size, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(BatchNormInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + batch_size, channel_size, dim_size, + running_mean_desc->stride(0), + running_var_desc->stride(0), + weight_desc->stride(0), + bias_desc->stride(0), + momentum, + eps + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::batch_norm + +#endif // __BATCH_NORM_INFO_H__ diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu new file mode 100644 index 000000000..e5e132c89 --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu @@ -0,0 +1,176 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "batch_norm_nvidia.cuh" + +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include + +#include "../info.h" + +namespace op::batch_norm::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + + float momentum, + float eps) { + + batchNormKernel( + output, + running_mean, + running_var, + input, + weight, + bias, + + batch_size, + channel_size, + 
dim_size, + + running_mean_stride, + running_var_stride, + weight_stride, + bias_stride, + + momentum, + eps); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata *output, + Tdata *running_mean, + Tdata *running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias, + + cudaStream_t stream) { + launchKernel<<>>( + output, + running_mean, + running_var, + input, + weight, + bias, + + info.batch_size, + info.channel_size, + info.dim_size, + + info.running_mean_stride, + info.running_var_stride, + info.weight_stride, + info.bias_stride, + info.momentum, + info.eps); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + void *running_mean, + void *running_var, + const void *input, + const void *weight, + const void *bias, + void *stream_) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + +#define CALCULATE_BATCH_NORM(BLOCK_SIZE, TDATA) \ + calculate_batch_norm(_info, (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream) +#define CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + 
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::nvidia diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh new file mode 100644 index 000000000..33c93f2b4 --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh @@ -0,0 +1,10 @@ +#ifndef __BATCH_NORM_NVIDIA_API_H__ +#define __BATCH_NORM_NVIDIA_API_H__ + +// #ifdef ENABLE_NINETOOTHED +#include "../batch_norm.h" +DESCRIPTOR(nvidia) + +// #endif + +#endif // __BATCH_NORM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/batch_norm/operator.cc b/src/infiniop/ops/batch_norm/operator.cc new file mode 100644 index 000000000..a87dfff60 --- /dev/null +++ b/src/infiniop/ops/batch_norm/operator.cc @@ -0,0 +1,156 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/batch_norm.h" + +#ifdef ENABLE_CPU_API +#include "cpu/batch_norm_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/batch_norm_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::batch_norm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + running_mean_desc, \ + running_var_desc, \ + input_desc, \ + weight_desc, \ + bias_desc, \ + momentum, \ + eps) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopBatchNorm( + infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + void *running_mean, + void *running_var, + const void *input, + const void *weight, + const void *bias, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, running_mean, running_var, \ + input, weight, bias, stream) + + switch 
(desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc new file mode 100644 index 000000000..af97c1d09 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc @@ -0,0 +1,321 @@ +#include "cross_entropy_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cross_entropy_loss::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + std::vector logits_shape; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const std::vector &shape) + : handle(handle_ptr), logits_shape(shape) { + // 计算workspace大小:需要存储per-sample loss + size_t N = logits_shape[0]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + workspace_size = N * inner_size * sizeof(float); + } + + void cross_entropy_f16_as_float(float *workspace, float *loss_result, + const fp16_t *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + // 转换F16 logits为float + size_t total_logits_size = N * C * inner_size; + std::vector float_logits(total_logits_size); + for (size_t i = 0; i < total_logits_size; ++i) { + float_logits[i] = utils::cast(logits[i]); + } + + // 使用float精度计算 + cross_entropy_cpu_float(workspace, loss_result, float_logits.data(), target); + } + + // 通用的float版本交叉熵计算 + void cross_entropy_cpu_float(float *workspace, float *loss_result, + const float *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = 
-std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + max_logit = std::max(max_logit, logits[logit_idx]); + } + + // 计算exp的和(减去最大值保证数值稳定) + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + sum_exp += std::exp(logits[logit_idx] - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit = logits[target_logit_idx]; + + // 计算交叉熵损失:log_softmax[target] = logit[target] - log(sum_exp) - max_logit + // 所以 -log_softmax[target] = log(sum_exp) + max_logit - logit[target] + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失(忽略ignore_index的样本) + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + *loss_result = valid_count > 0 ? static_cast(total_loss / valid_count) : 0.0f; + } + + // 通用模板版本(用于F32和BF16) + template + void cross_entropy_cpu_generic(float *workspace, T *loss_result, + const T *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + max_logit = std::max(max_logit, logit_val); + } + + // 计算exp的和 + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + sum_exp += std::exp(logit_val - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit; + if constexpr (std::is_same::value) { + target_logit = utils::cast(logits[target_logit_idx]); + } else { + target_logit = logits[target_logit_idx]; + } + + // 计算交叉熵损失 + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失 + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + float mean_loss = valid_count > 0 ? 
static_cast(total_loss / valid_count) : 0.0f; + + // 转换回输出类型 + if constexpr (std::is_same::value) { + *loss_result = utils::cast(mean_loss); + } else { + *loss_result = static_cast(mean_loss); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + logits_shape(std::move(other.logits_shape)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, const std::vector &shape) { + Opaque opaque(handle_ptr, shape); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, const void *target, + infiniDtype_t dtype) const { + if (!workspace || !loss || !logits || !target) { + return INFINI_STATUS_BAD_PARAM; + } + + if (workspace_size < this->workspace_size) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + float *workspace_ptr = static_cast(workspace); + const int64_t *target_ptr = static_cast(target); + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *logits_ptr = static_cast(logits); + float *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *logits_ptr = static_cast(logits); + fp16_t *loss_ptr = static_cast(loss); + + // F16特殊处理:使用float计算 + float temp_loss; + cross_entropy_f16_as_float(workspace_ptr, &temp_loss, logits_ptr, target_ptr); + *loss_ptr = utils::cast(temp_loss); + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *logits_ptr = static_cast(logits); + bf16_t *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + + size_t get_workspace_size() const { + return workspace_size; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig_shape = logits_desc->shape(); + std::vector logits_shape; + + if (orig_shape.size() == 1) { + logits_shape = {1, orig_shape[0]}; + } else { + logits_shape = orig_shape; + } + + if (logits_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque_result = Opaque::create(handle, logits_shape); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->get_workspace_size(), opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { + return _opaque->calculate(workspace, workspace_size, loss, logits, target, _dtype); +} + +} // namespace op::cross_entropy_loss::cpu diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h new file mode 100644 index 000000000..8afec63d0 --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CPU_H__ +#define __CROSS_ENTROPY_LOSS_CPU_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(cpu) + +#endif // __CROSS_ENTROPY_LOSS_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h new file mode 100644 index 000000000..dad108d78 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h @@ -0,0 +1,48 @@ +#ifndef __CROSS_ENTROPY_LOSS_H__ +#define __CROSS_ENTROPY_LOSS_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cross_entropy_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t loss_desc, \ + infiniopTensorDescriptor_t logits_desc, \ + infiniopTensorDescriptor_t target_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *loss, \ + const void *logits, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __CROSS_ENTROPY_LOSS_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/info.h b/src/infiniop/ops/cross_entropy_loss/info.h new file mode 100644 index 000000000..5278bf912 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/info.h @@ -0,0 +1,36 @@ +#ifndef __CROSS_ENTROPY_LOSS_INFO_H__ +#define __CROSS_ENTROPY_LOSS_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::cross_entropy_loss { + +class CrossEntropyInfo { +public: + CrossEntropyInfo() = default; + size_t batch = 0; + size_t num_classes = 0; + infiniDtype_t dtype; + + static utils::Result create( + infiniopTensorDescriptor_t loss, + infiniopTensorDescriptor_t logits, + infiniopTensorDescriptor_t target) { + + if (logits->ndim() != 2 || loss->ndim() != 1 || target->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CrossEntropyInfo info; + info.batch = logits->dim(0); + info.num_classes = logits->dim(1); + info.dtype = logits->dtype(); + return utils::Result(std::move(info)); + } +}; + +} // namespace op::cross_entropy_loss + +#endif // __CROSS_ENTROPY_LOSS_INFO_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu new file mode 100644 index 000000000..3d795a67a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "cross_entropy_loss_nvidia.cuh" + +namespace op::cross_entropy_loss::nvidia { +namespace cuda { + +__device__ __forceinline__ float to_float(float v) { return v; } +__device__ __forceinline__ float to_float(double v) { return (float)v; } 
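+// The kernel below computes, for each (n, inner) position, the numerically stable
+// per-sample cross-entropy  loss(idx) = logsumexp_c(logits) - logits[target], with
+// logsumexp_c(x) = log(sum_c exp(x_c - m)) + m and m = max_c x_c, so exp() never
+// overflows; positions whose target equals ignore_index are written as 0.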
+__device__ __forceinline__ float to_float(half v) { return __half2float(v); }
+__device__ __forceinline__ float to_float(__nv_bfloat16 v) {
+    return __bfloat162float(v);
+}
+
+template <typename T_in, typename T_out>
+__global__ void
+softmaxCrossEntropy_per_sample(T_out *__restrict__ loss,
+                               const T_in *__restrict__ logits,
+                               const int64_t *__restrict__ target, int N, int C,
+                               long long inner_size, int64_t ignore_index) {
+    long long total = (long long)N * inner_size;
+    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total) {
+        return;
+    }
+
+    int n = (int)(idx / inner_size);
+    int inr = (int)(idx % inner_size);
+
+    int64_t t = target[(long long)n * inner_size + inr];
+    if (ignore_index != LLONG_MIN && t == ignore_index) {
+        loss[idx] = (T_out)0;
+        return;
+    }
+    if (t < 0 || t >= C) {
+        loss[idx] = (T_out)0;
+        return;
+    }
+
+    const long long base = ((long long)n * C * inner_size) + inr;
+
+    // Numerically stable LSE: lse = log(sum exp(x - m)) + m
+    float m = -CUDART_INF_F;
+    for (int c = 0; c < C; ++c) {
+        m = fmaxf(m, to_float(logits[base + (long long)c * inner_size]));
+    }
+
+    float sum_exp = 0.f;
+    for (int c = 0; c < C; ++c) {
+        sum_exp += expf(to_float(logits[base + (long long)c * inner_size]) - m);
+    }
+
+    float lse = logf(sum_exp) + m;
+    float logit_t = to_float(logits[base + (long long)(int)t * inner_size]);
+    loss[idx] = (T_out)(lse - logit_t);
+}
+
+} // namespace cuda
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+    std::vector<size_t> logits_shape;
+    Opaque(std::shared_ptr<device::nvidia::Handle::Internal> p) : internal(p) {}
+    ~Opaque() = default;
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle_,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t /*loss_desc*/,
+                                  infiniopTensorDescriptor_t logits_desc,
+                                  infiniopTensorDescriptor_t /*target_desc*/) {
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = logits_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
+
+    const auto &orig = logits_desc->shape();
+    auto opaque = new Opaque(handle->internal());
+
+    if (orig.size() == 1) {
+        opaque->logits_shape = {1, orig[0]};
+    } else {
+        opaque->logits_shape = orig;
+    }
+
+    const auto &s = opaque->logits_shape;
+    long long N = (long long)s[0];
+    long long inner = 1;
+    for (size_t i = 2; i < s.size(); ++i) {
+        inner *= (long long)s[i];
+    }
+
+    size_t workspace_size = (size_t)(N * inner) * sizeof(float);
+    *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device,
+                               handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+#else
+    return INFINI_STATUS_NOT_IMPLEMENTED;
+#endif
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *loss, const void *logits,
+                                     const void *target, void *stream) const {
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+    const auto &s = _opaque->logits_shape;
+    int N = (int)s[0];
+    int C = (int)s[1];
+    long long inner = 1;
+    for (size_t i = 2; i < s.size(); ++i) {
+        inner *= (long long)s[i];
+    }
+    long long total = (long long)N * inner;
+
+    size_t need_ws = (size_t)total * sizeof(float);
+    if (workspace_size < need_ws) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    float *per_sample = reinterpret_cast<float *>(workspace);
+
+    const int64_t *tgt_i64 = reinterpret_cast<const int64_t *>(target);
+    const int64_t ignore_index = -100;
+
+    // 1) Write the per-sample loss into the workspace (float)
+    dim3 block(256);
+    dim3 grid((total + block.x - 1) / block.x);
+    cudaStream_t st = (cudaStream_t)stream;
+
+    if (_dtype == INFINI_DTYPE_F32) {
+        cuda::softmaxCrossEntropy_per_sample<<<grid, block, 0, st>>>(
+            per_sample, (const float *)logits, tgt_i64, N, C, inner, ignore_index);
+    } else if (_dtype == INFINI_DTYPE_F16) {
+        cuda::softmaxCrossEntropy_per_sample<<<grid, block, 0, st>>>(
+            per_sample, (const half *)logits, tgt_i64, N, C, inner, ignore_index);
+    } else if (_dtype == INFINI_DTYPE_BF16) {
+        cuda::softmaxCrossEntropy_per_sample<__nv_bfloat16, float>
+            <<<grid, block, 0, st>>>(per_sample, (const __nv_bfloat16 *)logits,
+                                     tgt_i64, N, C, inner, ignore_index);
+    }
+    {
+        auto err = cudaGetLastError();
+        if (err != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    }
+
+    // 2) Host-side mean (only samples with target != ignore_index are counted)
+    std::vector<float> h_loss((size_t)total);
+    std::vector<int64_t> h_tgt((size_t)total);
+    if (cudaMemcpyAsync(h_loss.data(), per_sample, need_ws,
+                        cudaMemcpyDeviceToHost, st)
+        != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    if (cudaMemcpyAsync(h_tgt.data(), tgt_i64, (size_t)total * sizeof(int64_t),
+                        cudaMemcpyDeviceToHost, st)
+        != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    if (cudaStreamSynchronize(st) != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    double acc = 0.0;
+    long long cnt = 0;
+    for (long long i = 0; i < total; ++i) {
+        if (h_tgt[i] != ignore_index) {
+            acc += (double)h_loss[i];
+            ++cnt;
+        }
+    }
+    double mean = (cnt > 0) ? (acc / (double)cnt) : 0.0;
+
+    // 3) Write the scalar mean back to the device loss pointer (one element, in the input dtype)
+    if (_dtype == INFINI_DTYPE_F32) {
+        float v = (float)mean;
+        if (cudaMemcpyAsync(loss, &v, sizeof(float), cudaMemcpyHostToDevice, st) != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    } else if (_dtype == INFINI_DTYPE_F16) {
+        half v = __float2half((float)mean);
+        if (cudaMemcpyAsync(loss, &v, sizeof(half), cudaMemcpyHostToDevice, st) != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    } else if (_dtype == INFINI_DTYPE_BF16) {
+        __nv_bfloat16 v = __float2bfloat16((float)mean);
+        if (cudaMemcpyAsync(loss, &v, sizeof(__nv_bfloat16), cudaMemcpyHostToDevice,
+                            st)
+            != cudaSuccess) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+    }
+    if (cudaStreamSynchronize(st) != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+#else
+    return INFINI_STATUS_NOT_IMPLEMENTED;
+#endif
+}
+} // namespace op::cross_entropy_loss::nvidia
diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh
new file mode 100644
index 000000000..843fc943d
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __CROSS_ENTROPY_LOSS_CUDA_CUH__
+#define __CROSS_ENTROPY_LOSS_CUDA_CUH__
+
+#include "../cross_entropy_loss.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __CROSS_ENTROPY_LOSS_CUDA_CUH__
diff --git a/src/infiniop/ops/cross_entropy_loss/operator.cc b/src/infiniop/ops/cross_entropy_loss/operator.cc
new file mode 100644
index 000000000..8668dc574
--- /dev/null
+++ b/src/infiniop/ops/cross_entropy_loss/operator.cc
@@ -0,0 +1,142 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/cross_entropy_loss.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/cross_entropy_loss_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/cross_entropy_loss_nvidia.cuh"
+#endif
+
+__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(
+    infiniopHandle_t handle,
infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + loss_desc, \ + logits_desc, \ + target_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCrossEntropyLoss( + infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, loss, logits, target, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..58a6d0f2d --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto 
&input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..867c7afa5 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,21 @@ +#ifndef __EXP_CPU_H__ +#define __EXP_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::exp(input); + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..12446f31a --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __EXP_CUDA_H__ +#define __EXP_CUDA_H__ + +#include +#include +#include + +namespace op::exp::cuda { +typedef struct ExpOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..3bdf2eb45 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto 
dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..cc369d660 --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/exp_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.cc b/src/infiniop/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 000000000..f7251bdd7 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,96 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::gather::cpu { + +infiniStatus_t calculate_gather( + const GatherInfo &info, + char *output, + const char *input, + const int64_t *index) { + // -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for (size_t d = 0; d < info.ndim; d++) { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + int gather_dim = static_cast(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + +#pragma omp parallel for + for (int i = 0; i < static_cast(total_size); i++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = static_cast(i); + for (int d = static_cast(info.ndim) - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + if (d != gather_dim) { + input_ptr += dim_index * element_size * info.input_strides[d]; + } + } + int64_t gather_number = *index_ptr; + input_ptr += gather_number * element_size * info.input_strides[gather_dim]; + // *output_ptr = *input_ptr; + memcpy( + output_ptr, + input_ptr, + element_size); + } + // --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + 
infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) const { + + return calculate_gather(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} // namespace op::gather::cpu diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.h b/src/infiniop/ops/gather/cpu/gather_cpu.h new file mode 100644 index 000000000..dac3583ac --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,8 @@ +#ifndef __GATHER_CPU_H__ +#define __GATHER_CPU_H__ + +#include "../gather.h" + +DESCRIPTOR(cpu) + +#endif // __GATHER_CPU_H__ diff --git a/src/infiniop/ops/gather/cuda/kernel.cuh b/src/infiniop/ops/gather/cuda/kernel.cuh new file mode 100644 index 000000000..dbb818e83 --- /dev/null +++ b/src/infiniop/ops/gather/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __GATHER_KERNEL_CUH__ +#define __GATHER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void gatherKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int gather_dim) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; d--) { + if (d == gather_dim) { + continue; + } + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_gather_size; c++) { + int64_t gather_number = *(index_ptr + c * index_strides[gather_dim]); + *(output_ptr + c * output_strides[gather_dim]) = *(input_ptr + gather_number * input_strides[gather_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __GATHER_KERNEL_CUH__ diff --git a/src/infiniop/ops/gather/gather.h b/src/infiniop/ops/gather/gather.h new file mode 100644 index 000000000..d4c88b0c4 --- /dev/null +++ b/src/infiniop/ops/gather/gather.h @@ -0,0 +1,47 @@ +#ifndef __GATHER_H__ +#define __GATHER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::gather::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GatherInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, 
\ + GatherInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *index, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/gather/info.h b/src/infiniop/ops/gather/info.h new file mode 100644 index 000000000..0098c7ea1 --- /dev/null +++ b/src/infiniop/ops/gather/info.h @@ -0,0 +1,58 @@ +#ifndef __GATHER_INFO_H__ +#define __GATHER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::gather { + +class GatherInfo { +private: + GatherInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + size_t input_dim_size; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createGatherInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE(output_desc->shape(), index_desc->shape()); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d++) { + if (d != dim) { + CHECK_OR_RETURN(input_desc->dim(d) == output_desc->dim(d), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + } + CHECK_OR_RETURN(ndim > dim, INFINI_STATUS_BAD_PARAM); + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(GatherInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->dim(dim), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::gather + +#endif // __GATHER_INFO_H__ diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cu b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu new file mode 100644 index 000000000..f0a2a7fb3 --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu @@ -0,0 +1,179 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "gather_nvidia.cuh" + +namespace op::gather::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, 
+ size_t index_gather_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int gather_dim) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata *output, + const Tdata *input, + const int64_t *index, + cudaStream_t stream, + void *workspace) { + size_t ndim = info.ndim; + ptrdiff_t *contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for (size_t d = 0; d < ndim; d++) { + if (d == gather_dim) { + continue; + } + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + ptrdiff_t *contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t *input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t *output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t *index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata><<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; + // ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream_) const { + if (workspace_size < _workspace_size) { + 
return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + +#define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) +#define CALCULATE_GATHER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + +#undef CALCULATE_GATHER_WITH_BLOCK_SIZE +#undef CALCULATE_GATHER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gather::nvidia diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh new file mode 100644 index 000000000..46d42fa0c --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GATHER_NVIDIA_API_H__ +#define __GATHER_NVIDIA_API_H__ +#include "../gather.h" + +DESCRIPTOR(nvidia) + +#endif // __GATHER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gather/operator.cc b/src/infiniop/ops/gather/operator.cc new file mode 100644 index 000000000..706009e9b --- /dev/null +++ b/src/infiniop/ops/gather/operator.cc @@ -0,0 +1,144 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gather.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gather_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/gather_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gather::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, index, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..e137be8a0 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..d5b369bce --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..9e279c2ef --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ 
b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..c51b18777 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, 
nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc new file mode 100644 index 000000000..68015ba6b --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc @@ -0,0 +1,93 @@ +#include "index_copy_inplace_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../../rearrange/cpu/rearrange_cpu.h" +#include "../info.h" + +namespace op::index_copy_inplace::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + // ---------------------- end: check data type and calculate workspace size ----------------------- + InfiniopTensorDescriptor *rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data()); + InfiniopTensorDescriptor *rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data()); + + void *in_rearrange_descriptor = nullptr; + void *out_rearrange_descriptor = nullptr; + + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc); + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) const { + size_t size_of_dtype = infiniSizeOf(_info.dtype); + auto index_ptr = reinterpret_cast(index); + + char *workspace_in = reinterpret_cast(workspace); + char *workspace_out = workspace_in + size_of_dtype * _info.total_input_size; + + reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream); + memset(workspace_out, 0, _info.total_output_size * size_of_dtype); + size_t copy_unit_size = _info.meta_strides[_info.dim] * size_of_dtype; +#pragma omp parallel for + for (int dst_index = 0; dst_index < static_cast(_info.output_shape[_info.dim]); dst_index++) { + size_t src_index = _info.index_shape[0] - 1; + while (true) { + if (*(index_ptr + src_index * _info.index_strides[0]) == static_cast(dst_index)) { + std::memcpy( + workspace_out + size_of_dtype * dst_index * _info.meta_strides[_info.dim], + workspace_in + size_of_dtype * src_index * _info.meta_strides[_info.dim], + copy_unit_size); + break; + } else if (src_index == 0) { + break; + } + src_index--; + } + } + 
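// The scatter above leaves, for every destination slice along the copy dimension,
+    // the last source slice whose index matches it (slices never selected by `index`
+    // stay zero-filled). The rearrange below copies the staged contiguous buffer back
+    // into the output tensor's original strides.
+    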
reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream); + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::cpu diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h new file mode 100644 index 000000000..49ba41f42 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_INPLACE_CPU_H__ +#define __INDEX_COPY_INPLACE_CPU_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(cpu) + +#endif // __INDEX_COPY_INPLACE_CPU_H__ diff --git a/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h new file mode 100644 index 000000000..80849ee2e --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h @@ -0,0 +1,53 @@ +#ifndef __INDEX_COPY_INPLACE_H__ +#define __INDEX_COPY_INPLACE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define INDEX_COPY_INPLACE_DESCRIPTOR(NAMESPACE) \ + namespace op::index_copy_inplace::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + IndexCopyInplaceInfo _info; \ + size_t _workspace_size; \ + void *_rearrange_desc_in; \ + void *_rearrange_desc_out; \ + Descriptor( \ + infiniDtype_t dtype, \ + IndexCopyInplaceInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id, \ + void *rearrange_desc_in, \ + void *rearrange_desc_out) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _rearrange_desc_in(rearrange_desc_in), \ + _rearrange_desc_out(rearrange_desc_out) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *index, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/info.h b/src/infiniop/ops/index_copy_inplace/info.h new file mode 100644 index 000000000..d7ad41d6f --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/info.h @@ -0,0 +1,74 @@ +#ifndef __INDEX_COPY_INPLACE_INFO_H__ +#define __INDEX_COPY_INPLACE_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::index_copy_inplace { + +class IndexCopyInplaceInfo { +private: + IndexCopyInplaceInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t total_input_size; + size_t total_output_size; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + std::vector meta_strides; + size_t dim; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createIndexCopyInplaceInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t 
input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + std::vector meta_strides(input_desc->ndim()); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + size_t total_input_size = 1; + size_t total_output_size = 1; + for (size_t d = 0; d < input_desc->ndim(); d++) { + total_input_size *= input_desc->dim(d); + total_output_size *= output_desc->dim(d); + if (d == dim) { + continue; + } else { + meta_strides[d] = last_dim * last_stride; + last_dim = input_desc->dim(d); + last_stride = meta_strides[d]; + } + } + meta_strides[dim] = last_dim * last_stride; + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(IndexCopyInplaceInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + total_input_size, + total_output_size, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + meta_strides, + dim + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::index_copy_inplace + +#endif // __INDEX_COPY_INPLACE_INFO_H__ diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu new file mode 100644 index 000000000..70772fe67 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu @@ -0,0 +1,127 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../rearrange/nvidia/rearrange_nvidia.cuh" +#include "../info.h" +#include "index_copy_inplace_nvidia.cuh" + +namespace op::index_copy_inplace::nvidia { + +infiniStatus_t calculate_index_copy_inplace( + char *output, + const char *input, + const int64_t *index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + cudaStream_t stream) { + int64_t *dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index++) { + CHECK_CUDA(cudaMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + cudaMemcpyDeviceToHost, + stream)); + cudaStreamSynchronize(stream); + CHECK_CUDA(cudaMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + cudaMemcpyDeviceToDevice, + stream)); + cudaStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + // 
---------------------- end: check data type and calculate workspace size ----------------------- + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + InfiniopTensorDescriptor *rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data()); + InfiniopTensorDescriptor *rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data()); + + void *in_rearrange_descriptor = nullptr; + void *out_rearrange_descriptor = nullptr; + + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc); + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream_) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char *workspace_in = reinterpret_cast(workspace); + char *workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + cudaMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + cudaDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream)); + cudaDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::nvidia diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh new file mode 100644 index 000000000..04c3c86f7 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#define __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(nvidia) + +#endif // __INDEX_COPY_INPLACE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/index_copy_inplace/operator.cc b/src/infiniop/ops/index_copy_inplace/operator.cc new file mode 100644 index 000000000..00e6bca3a --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/operator.cc @@ -0,0 +1,144 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/index_copy_inplace.h" + +#ifdef ENABLE_CPU_API +#include "cpu/index_copy_inplace_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include 
"nvidia/index_copy_inplace_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::index_copy_inplace::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopIndexCopyInplace( + infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, index, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc new file mode 100644 index 000000000..508dcecc6 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc @@ -0,0 +1,284 @@ +#include "interpolate_nearest_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include 
+ +namespace op::interpolate_nearest::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + InterpolateNearestInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const InterpolateNearestInfo &interpolate_info) + : handle(handle_ptr), info(interpolate_info) { + workspace_size = 0; + } + + template + size_t compute_input_index_1d(size_t idx) const { + size_t temp = idx; + + // 1D插值:3D张量 (N, C, W) + size_t w = temp % info.output_size[0]; + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; + } + + // 计算2D插值的输入索引 + template + size_t compute_input_index_2d(size_t idx) const { + size_t temp = idx; + + // 2D插值:4D张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width在索引1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; + } + + // 计算3D插值的输入索引 + template + size_t compute_input_index_3d(size_t idx) const { + size_t temp = idx; + + // 3D插值:5D张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width在索引2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height在索引1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = std::min(static_cast(std::floor(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; + } + + // 计算输出索引 + template + size_t compute_output_index(size_t idx) const { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= 
info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } + } + + // 计算总元素数 + size_t calculate_total_elements() const { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; + } + + // 主要的插值计算函数 + template + void interpolate_nearest_cpu(T *output, const T *input) const { + size_t total_elements = calculate_total_elements(); + +#pragma omp parallel for schedule(static) + for (ptrdiff_t idx = 0; idx < static_cast(total_elements); ++idx) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx); + break; + default: + continue; + } + + size_t output_idx = compute_output_index(idx); + output[output_idx] = input[input_idx]; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const InterpolateNearestInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16 && data_type != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_I8: { + int8_t *typed_output = static_cast(output); + const int8_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + 
} + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // 检查数据类型支持 + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_I8); + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, info, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::interpolate_nearest::cpu diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h new file mode 100644 index 000000000..78dd3ff97 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_CPU_H__ +#define __INTERPOLATE_NEAREST_CPU_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(cpu) + +#endif // __INTERPOLATE_NEAREST_CPU_H__ diff --git a/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh new file mode 100644 index 000000000..60c798792 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh @@ -0,0 +1,168 @@ +#ifndef INTERPOLATE_NEAREST_KERNEL_CUH +#define INTERPOLATE_NEAREST_KERNEL_CUH + +#include "../info.h" +#include + +template +__device__ inline size_t +compute_input_index_1d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 1D 插值:3D 张量 (N, C, W) + size_t w = temp % info.output_size[0]; // width 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; +} + +template +__device__ inline size_t +compute_input_index_2d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 2D 插值:4D 张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width 在索引 1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; +} + 
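// Editor's note (illustrative sketch, not part of the patch): the 1D/2D/3D index helpers
// above all apply the same per-axis nearest-neighbor rule,
//   src = min(floor(dst * in_size / out_size), in_size - 1),
// which a small host-side helper makes explicit:
#include <algorithm>
#include <cmath>
#include <cstddef>

size_t nearest_src(size_t dst, size_t in_size, size_t out_size) {
    float inv_scale = static_cast<float>(in_size) / static_cast<float>(out_size);
    return std::min(static_cast<size_t>(std::floor(dst * inv_scale)), in_size - 1);
}
// Example: upsampling W from 4 to 8 (inv_scale = 0.5) maps dst 0..7 to src 0,0,1,1,2,2,3,3;
// downsampling W from 8 to 4 (inv_scale = 2.0) maps dst 0..3 to src 0,2,4,6.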
+template +__device__ inline size_t +compute_input_index_3d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 3D 插值:5D 张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width 在索引 2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height 在索引 1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = min(static_cast(floorf(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; +} + +template +__device__ inline size_t +compute_output_index(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D 张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D 张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D 张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } +} + +__host__ __device__ inline size_t +calculate_total_elements(const InterpolateNearestInfo &info) { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; +} + +template +__global__ void interpolate_nearest_kernel(T *output, const T *input, + InterpolateNearestInfo info) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_elements = calculate_total_elements(info); + + if (idx < total_elements) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx, info); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx, info); + break; + case INTERPOLATE_3D: + input_idx = 
compute_input_index_3d(idx, info); + break; + default: + return; + } + + size_t output_idx = compute_output_index(idx, info); + output[output_idx] = input[input_idx]; + } +} + +#endif // INTERPOLATE_NEAREST_KERNEL_CUH diff --git a/src/infiniop/ops/interpolate_nearest/info.h b/src/infiniop/ops/interpolate_nearest/info.h new file mode 100644 index 000000000..162d6eb02 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/info.h @@ -0,0 +1,118 @@ +#ifndef __INTERPOLATE_NEAREST_INFO_H__ +#define __INTERPOLATE_NEAREST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +enum InterpolateDim { + INTERPOLATE_1D = 1, // 3D 张量 (N, C, W) + INTERPOLATE_2D = 2, // 4D 张量 (N, C, H, W) + INTERPOLATE_3D = 3 // 5D 张量 (N, C, D, H, W) +}; + +struct InterpolateNearestInfo { + size_t batch_size; + size_t channels; + + // 输入和输出的空间维度大小 + size_t input_size[3]; // [depth/height/width] 根据维度使用不同数量 + size_t output_size[3]; // [depth/height/width] 根据维度使用不同数量 + + InterpolateDim dim; // 插值维度:1D, 2D, 3D + infiniDtype_t dtype; + + // 张量步长(最多支持 5D 张量) + size_t input_stride[5]; + size_t output_stride[5]; + + static infiniStatus_t create( + InterpolateNearestInfo *info, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + // 检查数据类型 + if (input_desc->dtype() != output_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto input_shape = input_desc->shape(); + auto output_shape = output_desc->shape(); + auto input_stride = input_desc->strides(); + auto output_stride = output_desc->strides(); + + // 根据张量维度确定插值类型 + if (input_desc->ndim() == 3 && output_desc->ndim() == 3) { + // 1D 插值:3D 张量 (N, C, W) + info->dim = INTERPOLATE_1D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // width + info->output_size[0] = output_shape[2]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 3; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 4 && output_desc->ndim() == 4) { + // 2D 插值:4D 张量 (N, C, H, W) + info->dim = INTERPOLATE_2D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // height + info->input_size[1] = input_shape[3]; // width + info->output_size[0] = output_shape[2]; // height + info->output_size[1] = output_shape[3]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 4; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 5 && output_desc->ndim() == 5) { + // 3D 插值:5D 张量 (N, C, D, H, W) + info->dim = INTERPOLATE_3D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // depth + info->input_size[1] = input_shape[3]; // height + info->input_size[2] = input_shape[4]; // width + info->output_size[0] = output_shape[2]; // depth + info->output_size[1] = output_shape[3]; // height + info->output_size[2] = output_shape[4]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 5; ++i) 
{ + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info->dtype = input_desc->dtype(); + return INFINI_STATUS_SUCCESS; + } +}; + +#endif // __INTERPOLATE_NEAREST_INFO_H__ diff --git a/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h new file mode 100644 index 000000000..73499c2ff --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h @@ -0,0 +1,51 @@ +#ifndef __INTERPOLATE_NEAREST_H__ +#define __INTERPOLATE_NEAREST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::interpolate_nearest::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + InterpolateNearestInfo _info; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + InterpolateNearestInfo info, \ + size_t workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _dtype(dtype), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __INTERPOLATE_NEAREST_H__ diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu new file mode 100644 index 000000000..a7b63c6f4 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "interpolate_nearest_nvidia.cuh" +#include +#include +#include + +namespace op::interpolate_nearest::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // Check supported data types + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + size_t total_elements = calculate_total_elements(_info); + + int block_size = 256; + int grid_size = (total_elements + 
block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = reinterpret_cast(output); + const float *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_F16: { + half *typed_output = reinterpret_cast(output); + const half *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<<>>( + typed_output, typed_input, _info); + } break; + + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__nv_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<__nv_bfloat16> + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::nvidia diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh new file mode 100644 index 000000000..aab5f7882 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __INTERPOLATE_NEAREST_NVIDIA_CUH__ +#define __INTERPOLATE_NEAREST_NVIDIA_CUH__ + +#include "../../../devices/nvidia/nvidia_handle.h" +#include "../interpolate_nearest.h" + +DESCRIPTOR(nvidia) + +#endif // __INTERPOLATE_NEAREST_NVIDIA_CUH__ diff --git a/src/infiniop/ops/interpolate_nearest/operator.cc b/src/infiniop/ops/interpolate_nearest/operator.cc new file mode 100644 index 000000000..df367cfde --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/interpolate_nearest.h" + +#ifdef ENABLE_CPU_API +#include "cpu/interpolate_nearest_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/interpolate_nearest_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor( + infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::interpolate_nearest::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif 
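// Editor's note (illustrative fragment, not part of the patch): expected call sequence for
// the interpolate_nearest C API defined in this operator.cc. `handle`, `out_desc`,
// `in_desc`, `d_out`, `d_in`, `stream` and the CHECK(...) macro are placeholders assumed
// to be provided by the caller; this implementation reports a zero-sized workspace.
infiniopInterpolateNearestDescriptor_t desc = nullptr;
CHECK(infiniopCreateInterpolateNearestDescriptor(handle, &desc, out_desc, in_desc));
size_t workspace_size = 0;
CHECK(infiniopGetInterpolateNearestWorkspaceSize(desc, &workspace_size));
void *workspace = nullptr; // may stay null while workspace_size == 0
CHECK(infiniopInterpolateNearest(desc, workspace, workspace_size, d_out, d_in, stream));
CHECK(infiniopDestroyInterpolateNearestDescriptor(desc));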
+#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopInterpolateNearest( + infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc new file mode 100644 index 000000000..16c859985 --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc @@ -0,0 +1,322 @@ +#include "maxpool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::maxpool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + // CPU实现通常不需要额外的工作空间 + workspace_size = 0; + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 1D最大池化 + template + void maxpool_1d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 并行处理每个批次和通道 +#pragma omp parallel for schedule(static) + for (int b = 0; b < 
static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + size_t input_offset = static_cast(b) * channels * input_width + static_cast(c) * input_width; + size_t output_offset = static_cast(b) * channels * output_width + static_cast(c) * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + + output[output_offset + ow] = max_val; + } + } + } + } + + // 2D最大池化 + template + void maxpool_2d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 并行处理每个批次和通道 +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int c = 0; c < static_cast(channels); ++c) { + size_t input_offset = static_cast(b) * channels * input_height * input_width + static_cast(c) * input_height * input_width; + size_t output_offset = static_cast(b) * channels * output_height * output_width + static_cast(c) * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + + output[output_offset + oh * output_width + ow] = max_val; + } + } + } + } + } + + // 3D最大池化 + template + void maxpool_3d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 并行处理每个批次和通道 +#pragma omp parallel for schedule(static) + for (int b = 0; b < static_cast(batch_size); ++b) { + for (int 
c = 0; c < static_cast(channels); ++c) { + size_t input_offset = static_cast(b) * channels * input_depth * input_height * input_width + static_cast(c) * input_depth * input_height * input_width; + size_t output_offset = static_cast(b) * channels * output_depth * output_height * output_width + static_cast(c) * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kd * input_height * input_width + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + } + + output[output_offset + od * output_height * output_width + oh * output_width + ow] = max_val; + } + } + } + } + } + } + + // 主要的最大池化计算函数 + template + void maxpool_cpu(T *output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_1d(output, input); + break; + case 2: + maxpool_2d(output, input); + break; + case 3: + maxpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + 
bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::maxpool::cpu diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h new file mode 100644 index 000000000..f3ecd349d --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CPU_H__ +#define __MAX_POOL_CPU_H__ + +#include "../maxpool.h" + +DESCRIPTOR(cpu) + +#endif // __MAX_POOL_CPU_H__ diff --git a/src/infiniop/ops/maxpool/info.h b/src/infiniop/ops/maxpool/info.h new file mode 100644 index 000000000..ff56fe28c --- /dev/null +++ b/src/infiniop/ops/maxpool/info.h @@ -0,0 +1,113 @@ +#ifndef __MAX_POOL_INFO_H__ +#define __MAX_POOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::maxpool { + +inline utils::Result calculateMaxPoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + // 理论最大输出数 + size_t max_output = 0; + if (ceil_mode) { + max_output = (input_size + 2 * padding - kernel_size + stride - 1) / stride + 1; + } else { + max_output = (input_size + 2 * padding - kernel_size) / stride + 1; + } + + size_t valid_output = 0; + for (size_t i = 0; i < max_output; ++i) { + int64_t start = static_cast(i) * stride - padding; + int64_t end = start + kernel_size; + // 判断区间 [start, end) 和 [0, input_size) 是否有交集 + int64_t real_start = std::max(start, int64_t(0)); + int64_t real_end = std::min(end, int64_t(input_size)); + if (real_end > real_start) { + ++valid_output; + } + } + return utils::Result(valid_output); +} + +class MaxPoolInfo { + MaxPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + 
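// Editor's note (worked example, illustrative only): calculateMaxPoolOutputSize above
// evaluates
//   floor mode: (in + 2*pad - k) / s + 1
//   ceil  mode: (in + 2*pad - k + s - 1) / s + 1
// For in = 5, k = 2, s = 2, pad = 0 this gives 2 (floor) or 3 (ceil). The trailing loop
// then drops windows with no overlap with [0, in); here the ceil-mode windows start at
// 0, 2 and 4 and all overlap the input, so valid_output remains 3.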
info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Get spatial dimensions + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + auto output_size = calculateMaxPoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size); + size_t expected_size = output_size.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::maxpool + +#endif // __MAX_POOL_INFO_H__ diff --git a/src/infiniop/ops/maxpool/maxpool.h b/src/infiniop/ops/maxpool/maxpool.h new file mode 100644 index 000000000..5ee7703c5 --- /dev/null +++ b/src/infiniop/ops/maxpool/maxpool.h @@ -0,0 +1,53 @@ +#ifndef __MAX_POOL_H__ +#define __MAX_POOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAX_POOL_H__ diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu new file mode 100644 index 000000000..8b94a29c1 --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu @@ -0,0 +1,240 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "maxpool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef 
ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically doesn't need workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque 
opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::nvidia diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh new file mode 100644 index 000000000..539ad5a1a --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CUDA_CUH__ +#define __MAX_POOL_CUDA_CUH__ + +#include "../maxpool.h" + +DESCRIPTOR(nvidia) + +#endif // __MAX_POOL_CUDA_CUH__ diff --git a/src/infiniop/ops/maxpool/operator.cc b/src/infiniop/ops/maxpool/operator.cc new file mode 100644 index 000000000..fa47b5b72 --- /dev/null +++ b/src/infiniop/ops/maxpool/operator.cc @@ -0,0 +1,147 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/maxpool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/maxpool_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return
op::maxpool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMaxPool( + infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc new file mode 100644 index 000000000..03b808bf5 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -0,0 +1,100 @@ +#include "scatter_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::scatter::cpu { + +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + char *output, + const char *input, + const int64_t *index) { + // -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for (size_t d = 0; d < info.ndim; d++) { + if (d == info.dim) { + continue; + } + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + 
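// The loop above packs every non-scatter dimension into one flat batch index:
+    // batch_size below is the product of the index shape over those dimensions
+    // (e.g. an index shape of (2, 3, 4) with dim == 1 gives batch_size = 2 * 4 = 8),
+    // and each iteration of the parallel loop handles one such slice, copying
+    // index_shape[dim] elements along the scatter dimension.
+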
size_t batch_size = last_dim * last_stride; + int scatter_dim = static_cast(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + +#pragma omp parallel for + for (int n = 0; n < static_cast(batch_size); n++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = static_cast(n); + for (int d = static_cast(info.ndim) - 1; d >= 0; d--) { + if (d == scatter_dim) { + continue; + } + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + input_ptr += dim_index * element_size * info.input_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + } + for (size_t c = 0; c < info.index_shape[scatter_dim]; c++) { + int64_t scatter_number = *(index_ptr + c * info.index_strides[scatter_dim]); + memcpy( + output_ptr + scatter_number * element_size * info.output_strides[scatter_dim], + input_ptr + c * element_size * info.input_strides[scatter_dim], + element_size); + } + } + + // --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) const { + + return calculate_scatter(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} // namespace op::scatter::cpu diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h new file mode 100644 index 000000000..ad52c7b91 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_CPU_H__ +#define __SCATTER_CPU_H__ + +#include "../scatter.h" + +DESCRIPTOR(cpu) + +#endif // __SCATTER_CPU_H__ diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh new file mode 100644 index 000000000..733d2e14d --- /dev/null +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __SCATTER_KERNEL_CUH__ +#define __SCATTER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void scatterKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int scatter_dim) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; 
d--) { + if (d == scatter_dim) { + continue; + } + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_scatter_size; c += BLOCK_SIZE) { + int64_t scatter_number = *(index_ptr + c * index_strides[scatter_dim]); + *(output_ptr + scatter_number * output_strides[scatter_dim]) = *(input_ptr + c * input_strides[scatter_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __SCATTER_KERNEL_CUH__ diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h new file mode 100644 index 000000000..4ef8b9e76 --- /dev/null +++ b/src/infiniop/ops/scatter/info.h @@ -0,0 +1,64 @@ +#ifndef __SCATTER_INFO_H__ +#define __SCATTER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::scatter { + +class ScatterInfo { +private: + ScatterInfo() = default; + +public: + // ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + + // ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createScatterInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + // ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == output_desc->ndim() && output_desc->ndim() == index_desc->ndim(), + INFINI_STATUS_BAD_TENSOR_SHAPE); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d++) { + if (d != dim) { + CHECK_OR_RETURN( + index_desc->dim(d) <= input_desc->dim(d) && index_desc->dim(d) <= output_desc->dim(d), + INFINI_STATUS_BAD_TENSOR_SHAPE;); + } + } + CHECK_OR_RETURN(index_desc->dim(dim) <= input_desc->dim(dim), INFINI_STATUS_BAD_TENSOR_SHAPE); + // -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(ScatterInfo{ + // ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim + // ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} // namespace op::scatter + +#endif // __SCATTER_INFO_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h new file mode 100644 index 000000000..d5ce0ef16 --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_METAX_H__ +#define __SCATTER_METAX_H__ + +#include "../scatter.h" + +DESCRIPTOR(metax) + +#endif // __SCATTER_METAX_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca new file mode 100644 index 000000000..1c742f60d --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -0,0 +1,190 @@ 
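+// Metax (MACA) backend for the scatter operator. It mirrors the NVIDIA CUDA
+// implementation in this patch, using the hc* runtime (hcStream_t, hcMemcpyAsync)
+// in place of the cuda* equivalents.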
+#include "../../../devices/metax/metax_common.h" +#include "scatter_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate 
workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_SCATTER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_SCATTER_WITH_METAX_BLOCK + #undef CALCULATE_SCATTER +} +} // namespace op::scatter::metax diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu new file mode 100644 index 000000000..136ad1f65 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -0,0 +1,180 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "scatter_nvidia.cuh" + +namespace op::scatter::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata *output, + const Tdata *input, + const int64_t *index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t *output_strides, + ptrdiff_t *input_strides, + ptrdiff_t *index_strides, + ptrdiff_t *contiguous_strides, + int scatter_dim) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel 
----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata *output, + const Tdata *input, + const int64_t *index, + cudaStream_t stream, + void *workspace) { + size_t ndim = info.ndim; + ptrdiff_t *contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for (size_t d = 0; d < ndim; d++) { + if (d == scatter_dim) { + continue; + } + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t *contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t *input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t *output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t *index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { + auto handle = reinterpret_cast(handle_); + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; + // ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream_) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) +#define CALCULATE_SCATTER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case 
INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + +#undef CALCULATE_SCATTER_WITH_BLOCK_SIZE +#undef CALCULATE_SCATTER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::scatter::nvidia diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh new file mode 100644 index 000000000..a199edb6e --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __SCATTER_NVIDIA_API_H__ +#define __SCATTER_NVIDIA_API_H__ +#include "../scatter.h" + +DESCRIPTOR(nvidia) + +#endif // __SCATTER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc new file mode 100644 index 000000000..95857d731 --- /dev/null +++ b/src/infiniop/ops/scatter/operator.cc @@ -0,0 +1,160 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scatter.h" + +#ifdef ENABLE_CPU_API +#include "cpu/scatter_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/scatter_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/scatter_metax.h" +#endif + +__C infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopScatter( + infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *index, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h new file mode 100644 index 000000000..e1e332471 --- /dev/null +++ b/src/infiniop/ops/scatter/scatter.h @@ -0,0 +1,47 @@ +#ifndef __SCATTER_H__ +#define __SCATTER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + ScatterInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *index, \ + void *stream) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/test/infiniop/averagepool.py 
b/test/infiniop/averagepool.py new file mode 100644 index 000000000..55d5c37cf --- /dev/null +++ b/test/infiniop/averagepool.py @@ -0,0 +1,239 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import Tuple +import math +from torch.nn import functional as F + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool(input_tensor, kernel_size, stride, padding, ceil_mode, output_tensor): + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + result = F.avg_pool1d( + input_tensor.to(torch.float32), kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + result = F.avg_pool2d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + result = F.avg_pool3d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + # 将计算结果转换回原始数据类型 + output_tensor.copy_(result.to(output_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + return ( + math.ceil((input_size + 2 * p - k) / s 
+ 1) + if ceil_mode + else math.floor((input_size + 2 * p - k) / s + 1) + ) + + batch, channel, *spatial = input_shape + output_spatial = [ + calc_output_size(spatial[i], kernel_size[i], stride[i], padding[i], ceil_mode) + for i in range(len(spatial)) + ] + return (batch, channel) + tuple(output_spatial) + + +def tuple_to_void_p(py_tuple: Tuple): + arr = (ctypes.c_uint64 * len(py_tuple))(*py_tuple) + return ctypes.cast(arr, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + output_tensor = TestTensor(output_shape, None, dt=tensor_dtype, device=device) + + print( + f"Testing AvgPool on {InfiniDeviceNames[device]} with input_shape: {input_shape}, kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, output_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_averagepool(): + check_error( + LIBINFINIOP.infiniopAvgPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Mismatch for shape {input_shape}, kernel {kernel_size}" + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lib_averagepool, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/batch_norm.py b/test/infiniop/batch_norm.py new file mode 100644 index 000000000..a7b46858f --- /dev/null +++ b/test/infiniop/batch_norm.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + 
debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, momentum, eps + ((13, 4, 5,), 0.1, 1e-5), + ((2, 3, 4), 0.1, 1e-4), + ((15, 16, 17,), 0.2, 1e-5), + ((50, 60, 70), 0.1, 1e-4), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# No implement for INPLACE + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_batch_norm( + output: torch.Tensor, + running_mean: torch.Tensor, + running_var: torch.Tensor, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + init_running_mean: torch.Tensor, + init_running_var: torch.Tensor, + momentum: float, + eps: float +): + bn = torch.nn.BatchNorm1d( + num_features=input.shape[1], + eps=eps, + momentum=momentum, + dtype=input.dtype, + ) + bn.weight.data = weight + bn.bias.data = bias + bn.running_mean.data = init_running_mean + bn.running_var.data = init_running_var + output.copy_(bn(input).detach()) + running_mean.copy_(bn.running_mean.data) + running_var.copy_(bn.running_var.data) + + +def test( + handle, + device, + shape, momentum, eps, + inplace, + dtype, + sync=None, +): + running_mean = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + running_var = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + input = TestTensor( + shape, + None, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + shape, + None, + dtype, + device + ) + + weight = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + bias = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + + print( + f"Testing BatchNorm on {InfiniDeviceNames[device]} with shape:{shape}, inplace:{inplace}, momentum:{momentum}, eps:{eps}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + + torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), + running_mean.torch_tensor(), running_var.torch_tensor(), + momentum, eps + ) + + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBatchNormDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + running_mean.descriptor, + running_var.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor, + momentum, + eps + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, running_mean, running_var, input, weight, bias]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBatchNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_batch_norm(): + check_error( + 
LIBINFINIOP.infiniopBatchNorm( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + running_mean.data(), + running_var.data(), + input.data(), + weight.data(), + bias.data(), + None, + ) + ) + + lib_batch_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + debug(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + debug(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), momentum, eps + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_batch_norm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyBatchNormDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my BatchNorm passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss.py b/test/infiniop/cross_entropy_loss.py new file mode 100644 index 000000000..acc5cadc4 --- /dev/null +++ b/test/infiniop/cross_entropy_loss.py @@ -0,0 +1,213 @@ +import torch +import ctypes +from ctypes import c_uint64 +import numpy as np + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + infiniopOperatorDescriptor_t, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + TestWorkspace, + InfiniDeviceEnum, +) +from torch.nn import functional as F + +_TEST_CASES = [ + # Single sample classification + ((10,), 10), + ((200,), 200), + # 2D: (N, C) - batch classification + ((4, 10), 10), + ((8, 5), 5), + ((16, 100), 100), + ((32, 1000), 1000), + ((64, 21), 21), + ((128, 50), 50), + # 3D: (N, C, d1) - sequence classification + ((4, 10, 5), 10), + # 4D: (N, C, d1, d2) - image segmentation + ((2, 8, 8, 8), 8), + # 5D: (N, C, d1, d2, d3) - 3D segmentation + ((3, 10, 10, 20, 30), 10), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_pytorch(logits, target): + return F.cross_entropy(logits.double(), target.long(), reduction="mean") + + +def test( + handle, + device, + input_shape, + num_classes, + tensor_dtype=InfiniDtype.F32, + sync=None, +): + # 根据输入形状确定logits和target的形状 + if len(input_shape) == 1: + # Shape (C,) - single sample classification + logits_shape = (num_classes,) + target_shape = (1,) # 修改:使用 (1,) 
而不是标量 + else: + # Shape (N, C, [d1], [d2], ...) + logits_shape = input_shape + target_shape = (input_shape[0],) + input_shape[2:] + + print( + f"Testing CrossEntropyLoss on {InfiniDeviceNames[device]} with logits_shape: {logits_shape}, target_shape: {target_shape}, dtype:{InfiniDtypeNames[tensor_dtype]}" + ) + + # 创建logits张量 + logits = TestTensor(logits_shape, None, dt=tensor_dtype, device=device) + + # 创建target张量 + target_torch = torch.randint( + 0, + num_classes, + target_shape, + dtype=torch.long, + device=logits.torch_tensor().device, + ) + target = TestTensor.from_torch(target_torch, dt=InfiniDtype.I64, device=device) + + # 创建loss张量 + loss = TestTensor((1,), None, dt=tensor_dtype, device=device) + + # 计算PyTorch参考损失 + if len(input_shape) == 1: + # 对于一维logits,target需要是标量 + target_scalar = target.torch_tensor()[0] + pytorch_loss = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + pytorch_loss = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + + # 将参考结果存储到loss张量 + loss.torch_tensor()[0] = pytorch_loss.to(loss.torch_tensor().dtype) + + if sync: + sync() + + # 创建算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossDescriptor( + handle, + ctypes.byref(descriptor), + loss.descriptor, + logits.descriptor, + target.descriptor, + ) + ) + + # 销毁tensor的描述符以防止内核直接使用 + for tensor in [logits, target, loss]: + tensor.destroy_desc() + + # 获取工作空间大小并创建工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + # PyTorch参考实现函数 + def torch_cross_entropy(): + if len(input_shape) == 1: + target_scalar = target.torch_tensor()[0] + result = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + result = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + loss.torch_tensor()[0] = result.to(loss.torch_tensor().dtype) + + # InfiniOP实现函数 + def lib_cross_entropy(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLoss( + descriptor, + workspace.data(), + workspace_size.value, + loss.data(), + logits.data(), + target.data(), + None, + ) + ) + + # 执行InfiniOP算子 + lib_cross_entropy() + + if sync: + sync() + + # 验证结果 + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + actual_loss = loss.actual_tensor()[0] + expected_loss = loss.torch_tensor()[0] + + if DEBUG: + print(f"Expected loss: {expected_loss.item()}") + print(f"Actual loss: {actual_loss.item()}") + if target_shape: + print( + f"Target shape: {target_shape}, first few targets: {target.torch_tensor().flatten()[:5]}" + ) + else: + print(f"Target (scalar): {target.torch_tensor()[0].item()}") + debug(actual_loss, expected_loss, atol=atol, rtol=rtol) + + if not torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol): + print("--- ERROR ANALYSIS ---") + print(f"Expected: {expected_loss.item()}, Actual: {actual_loss.item()}") + print(f"Difference: {abs(actual_loss - expected_loss).item()}") + print(f"Tolerance: atol={atol}, rtol={rtol}") + + assert torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol) + + # Profile功能 + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCrossEntropyLossDescriptor(descriptor)) 
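+
+# A minimal reference sketch (not used by the test above) of the quantity being
+# checked for the plain (N, C) case: the mean negative log-softmax probability of
+# the target class, which is what F.cross_entropy with reduction="mean" computes.
+# The helper name is ours for illustration, not part of the library under test.
+def _manual_cross_entropy_2d(logits, target):
+    log_p = torch.log_softmax(logits.double(), dim=1)
+    return -log_p.gather(1, target.long().unsqueeze(1)).mean()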
+ + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAll CrossEntropyLoss tests passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + 
lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gather.py b/test/infiniop/gather.py new file mode 100644 index 000000000..b5c8ea93d --- /dev/null +++ b/test/infiniop/gather.py @@ -0,0 +1,160 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + # input_shape, output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None) +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_gather(output, input, dim, index): + torch.gather(input, dim, index, out=output) + +def test( + handle, + device, + input_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing Gather on {InfiniDeviceNames[device]} with input shape:{input_shape}, dim:{dim}, output_shape:{output_shape}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device + ) + torch_index = torch.randint(low=0, high=input_shape[dim], size=output_shape, dtype=torch.int64) + if index_strides: + torch_index = torch_index.as_strided(output_shape, index_strides) + index = TestTensor( + output_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + ) + + torch_gather(output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in 
[input, output, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGatherWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_gather(): + check_error( + LIBINFINIOP.infiniopGather( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_gather() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print("x:", input.torch_tensor()) + # print("CALCULATED:\n", output.actual_tensor(), ) + # print("GT\n", output.torch_tensor()) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_gather( + output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gather(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGatherDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my Gather passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = 
TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/index_copy_inplace.py b/test/infiniop/index_copy_inplace.py new file mode 100644 index 000000000..97dbd8266 --- /dev/null +++ b/test/infiniop/index_copy_inplace.py @@ -0,0 +1,180 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_TEST_CASES = [ + # input_shape, output_shape, dim, output_strides, input_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], Inplace.OUT_OF_PLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.INPLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.OUT_OF_PLACE), + ([133, 23, 53], [133, 23, 53], 1, None, None, Inplace.OUT_OF_PLACE), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, Inplace.OUT_OF_PLACE), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + 
InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_index_copy_inplace(output, input, index, dim): + output.index_copy_(dim, index, input.clone()) + + +def test( + handle, + device, + input_shape, output_shape, dim, output_strides, input_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing index_copy_inplace on {InfiniDeviceNames[device]} with shape:{input_shape}," + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + assert output_shape == input_shape + output = input + else: + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + index_list = list(range(output_shape[dim])) + + random.shuffle(index_list) + torch_index = torch.tensor(index_list[:input_shape[dim]], dtype=torch.int64) + index = TestTensor( + [input_shape[dim]], + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_index_copy_inplace(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetIndexCopyInplaceWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_index_copy_inplace(): + check_error( + LIBINFINIOP.infiniopIndexCopyInplace( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_index_copy_inplace() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_index_copy_inplace( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_index_copy_inplace(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my index_copy_inplace passed!\033[0m") diff --git a/test/infiniop/interpolate_nearest.py b/test/infiniop/interpolate_nearest.py new file mode 100644 index 000000000..0440fdfec --- /dev/null +++ 
b/test/infiniop/interpolate_nearest.py @@ -0,0 +1,265 @@ +import torch +import ctypes +from ctypes import c_uint64 + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, output_shape, output_stride) +_TEST_CASES = [ + # 2D test cases - simplified to one line each + ((1, 1, 2, 2), None, (1, 1, 4, 4), None), # Simple contiguous case + ((1, 3, 4, 4), (48, 16, 4, 1), (1, 3, 8, 8), (192, 64, 8, 1)), # 2D upscaling 2x + ((1, 3, 8, 8), (192, 64, 8, 1), (1, 3, 4, 4), (48, 16, 4, 1)), # 2D downscaling 2x + ((2, 4, 2, 2), (16, 4, 2, 1), (2, 4, 6, 6), (144, 36, 6, 1)), # Batch upscaling + ( + (1, 1, 3, 5), + (15, 15, 5, 1), + (1, 1, 9, 10), + (90, 90, 10, 1), + ), # Different aspect ratio + ( + (4, 64, 16, 16), + (16384, 256, 16, 1), + (4, 64, 32, 32), + (65536, 1024, 32, 1), + ), # Large batch + ((1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 7, 7), (49, 49, 7, 1)), # Small to large + ( + (1, 2, 3, 4), + (24, 1, 8, 2), + (1, 2, 6, 8), + (96, 1, 16, 2), + ), # Non-contiguous layout + ((2, 3, 2, 2), (32, 8, 4, 1), (2, 3, 4, 4), (128, 32, 8, 1)), # Padded strides + # 1D test cases + ((1, 3, 8), (24, 8, 1), (1, 3, 16), (48, 16, 1)), # 1D upscaling 2x + ((2, 5, 10), (50, 10, 1), (2, 5, 5), (25, 5, 1)), # 1D downscaling 2x + ((4, 2, 32), (64, 32, 1), (4, 2, 64), (128, 64, 1)), # 1D larger upscaling + # 3D test cases + ( + (1, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (1, 2, 4, 4, 4), + (128, 64, 16, 4, 1), + ), # 3D upscaling 2x + ( + (1, 1, 2, 3, 4), + (24, 24, 12, 4, 1), + (1, 1, 4, 6, 8), + (192, 192, 48, 8, 1), + ), # 3D uniform upscaling + ( + (3, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (3, 2, 3, 3, 3), + (54, 27, 9, 3, 1), + ), # 3D non-uniform scaling +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.I8] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def interpolate_nearest(input_tensor, output_shape, output_tensor): + """ + Perform nearest neighbor interpolation using PyTorch as reference + """ + # Extract spatial dimensions (H, W) + target_size = output_shape[2:] # Skip batch and channel dimensions + + # Use PyTorch's interpolate function with nearest mode + if input_tensor.dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float32,进行插值,再转换回原类型 + original_dtype = input_tensor.dtype + + # 转换为 float32 进行插值 + float_input = input_tensor.float() + result = F.interpolate(float_input, size=target_size, mode="nearest") + + # 转换回原始类型 + result = result.to(original_dtype) + else: + result = F.interpolate(input_tensor, size=target_size, mode="nearest") + + output_tensor.copy_(result) + + +def test( + handle, + device, + input_shape, + input_stride, + output_shape, + output_stride, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input and output tensors + # For I8 type, use 
appropriate randint range (-128 to 127) and don't use scale + if tensor_dtype == InfiniDtype.I8: + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, + randint_low=-128, randint_high=128 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device, + randint_low=-128, randint_high=128 + ) + else: + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing InterpolateNearest on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"input_stride: {input_stride}, output_stride: {output_stride}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ) + + if sync is not None: + sync() + + # Create descriptor for our interpolate_nearest operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateInterpolateNearestDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetInterpolateNearestWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_interpolate_nearest(): + check_error( + LIBINFINIOP.infiniopInterpolateNearest( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_interpolate_nearest() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for shape {input_shape} -> {output_shape}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_interpolate_nearest(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyInterpolateNearestDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 20a9188d6..a0f7cbccb 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float +from ctypes import 
c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -1897,3 +1897,322 @@ def paged_attention_prefill_(lib): lib.infiniopDestroyPagedAttentionPrefillDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def averagepool_(lib): + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_bool, + ] + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def batch_norm_(lib): + lib.infiniopCreateBatchNormDescriptor.restype = c_int32 + lib.infiniopCreateBatchNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopGetBatchNormWorkspaceSize.restype = c_int32 + lib.infiniopGetBatchNormWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopBatchNorm.restype = c_int32 + lib.infiniopBatchNorm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyBatchNormDescriptor.restype = c_int32 + lib.infiniopDestroyBatchNormDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cross_entropy_loss_(lib): + lib.infiniopCreateCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCrossEntropyLossWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCrossEntropyLoss.restype = c_int32 + lib.infiniopCrossEntropyLoss.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExpDescriptor.restype = c_int32 + 
lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gather_(lib): + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetGatherWorkspaceSize.restype = c_int32 + lib.infiniopGetGatherWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def index_copy_inplace_(lib): + lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopCreateIndexCopyInplaceDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetIndexCopyInplaceWorkspaceSize.restype = c_int32 + lib.infiniopGetIndexCopyInplaceWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIndexCopyInplace.restype = c_int32 + lib.infiniopIndexCopyInplace.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopDestroyIndexCopyInplaceDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def interpolate_nearest_(lib): + lib.infiniopCreateInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopCreateInterpolateNearestDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetInterpolateNearestWorkspaceSize.restype = c_int32 + lib.infiniopGetInterpolateNearestWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopInterpolateNearest.restype = c_int32 + lib.infiniopInterpolateNearest.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopDestroyInterpolateNearestDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def maxpool_(lib): + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + 
lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_bool, + ] + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def scatter_(lib): + lib.infiniopCreateScatterDescriptor.restype = c_int32 + lib.infiniopCreateScatterDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetScatterWorkspaceSize.restype = c_int32 + lib.infiniopGetScatterWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopScatter.restype = c_int32 + lib.infiniopScatter.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyScatterDescriptor.restype = c_int32 + lib.infiniopDestroyScatterDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/maxpool.py b/test/infiniop/maxpool.py new file mode 100644 index 000000000..81ddce060 --- /dev/null +++ b/test/infiniop/maxpool.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D max pooling cases + ((1, 3, 8), None, (2,), (2,), (0,), False), + ((2, 4, 16), None, (3,), (2,), (1,), False), + ((3, 2, 77), None, (6,), (4,), (3,), True), + # 2D max pooling cases + ((1, 1, 4, 4), None, (2, 2), (2, 2), (0, 0), False), + ((2, 3, 8, 8), None, (3, 3), (2, 2), (1, 1), False), + ((1, 64, 32, 32), None, (2, 2), (2, 2), (0, 0), False), + ((4, 128, 16, 16), None, (3, 3), (1, 1), (1, 1), False), + # 3D max pooling cases + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + # Cases with ceil_mode=True + ((1, 1, 7, 7), None, (3, 3), (2, 2), (1, 1), True), + ((1, 2, 5), None, (3,), (2,), (0,), True), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool(input_tensor, kernel_size, stride, padding, ceil_mode): + """ + Perform max pooling using PyTorch as reference + """ + ndim = len(input_tensor.shape) - 2 # 
Spatial dimensions + + if ndim == 1: + result = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + result = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + result = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + return result + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Compute reference result using PyTorch + torch_ref_output = max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ) + + # Use PyTorch输出shape来初始化output_tensor + output_tensor = TestTensor( + torch_ref_output.shape, None, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPool on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, kernel_size: {kernel_size}, " + f"stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + if sync is not None: + sync() + + # Create descriptor for our max pool operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_max_pool(): + check_error( + LIBINFINIOP.infiniopMaxPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_max_pool() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_max_pool(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # 
Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/scatter.py b/test/infiniop/scatter.py new file mode 100644 index 000000000..86ccdcdeb --- /dev/null +++ b/test/infiniop/scatter.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + +_TEST_CASES = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (7, 1), (1, 7), None), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (1, 2, 6), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, None, None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, [16, 8, 4, 1], None), +] + + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_scatter(output: torch.Tensor, input, index, dim): + output.scatter_(dim, index, src=input) + + +def test( + handle, + device, + input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing scatter on {InfiniDeviceNames[device]} with input_shape:{input_shape}, index_shape:{index_shape}, output_shape:{output_shape}, dim:{dim}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + + def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + index = torch.empty(index_shape, dtype=torch.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input.shape[0]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 2: + for i in range(input.shape[0]): + for j in range(input.shape[1]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, j, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 1: + for i in range(input.shape[0]): + for j in range(input.shape[2]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :, j] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 4 and scatter_dim == 0: + for i in range(input.shape[1]): + for j in range(input.shape[2]): + for k in range(input.shape[3]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[:, i, j, k] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + return index + + torch_index = get_test_index_tensor(input_shape, index_shape, output_shape, dim).type(torch.int64) + if index_strides: + torch_index = torch_index.as_strided(index_shape, index_strides) + index = 
TestTensor( + index_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_scatter(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateScatterDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetScatterWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_scatter(): + check_error( + LIBINFINIOP.infiniopScatter( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_scatter() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_scatter( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_scatter(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyScatterDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my scatter passed!\033[0m") From 05096eacf045078e0429415e065e244a69167350 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Tue, 20 Jan 2026 02:43:25 +0000 Subject: [PATCH 6/7] Issue/888 - Refactor: integrate exp and hardswish operators into unified unary framework. 
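With this change the per-operator boilerplate (descriptor create/calculate, dtype switch, device dispatch) is generated by shared unary-elementwise macros instead of being hand-written for each operator. As a rough sketch only, wiring up a hypothetical new unary operator (called "relu6" here purely for illustration; the exact macro arguments should be checked against the macro definitions added below) would look roughly like this:

    // include/infiniop/ops/unary_ops_api.h -- declare the public C API
    UNARY_OP_API_DECLARE(relu6, Relu6)

    // src/infiniop/elementwise/unary.h -- add UnaryMode::Relu6 and its math
    // in UnaryOp::operator() (host) and the CUDA counterpart

    // src/infiniop/ops/relu6/cpu/relu6_cpu.h -- bind the descriptor to the mode
    UNARY_ELEMENTWISE_DESCRIPTOR(relu6, cpu, op::elementwise::unary::UnaryMode::Relu6)

    // src/infiniop/ops/relu6/cpu/relu6_cpu.cc -- generate create()/calculate()
    namespace op::relu6::cpu {
    ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(relu6) // F16/F32/F64/BF16 dispatch
    } // namespace op::relu6::cpu

    // src/infiniop/ops/relu6/operator.cc -- generate the device dispatch
    UNARY_OP_IMPL(relu6, Relu6)

    // the nvidia/ backend mirrors the CPU side via
    // ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(relu6)

The exp and hardswish conversions in this patch follow exactly this layout.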
--- include/infiniop/ops/exp.h | 24 --- include/infiniop/ops/hardswish.h | 24 --- include/infiniop/ops/unary_ops_api.h | 2 + .../elementwise/cpu/elementwise_cpu_impl.h | 185 +++++++++------- .../nvidia/elementwise_nvidia_impl.cuh | 197 ++++++++++-------- src/infiniop/elementwise/unary.h | 47 +++++ src/infiniop/ops/exp/cpu/exp_cpu.cc | 48 +---- src/infiniop/ops/exp/cpu/exp_cpu.h | 16 +- src/infiniop/ops/exp/cuda/kernel.cuh | 33 +-- src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 53 +---- src/infiniop/ops/exp/operator.cc | 134 +----------- .../ops/hardswish/cpu/hardswish_cpu.cc | 48 +---- .../ops/hardswish/cpu/hardswish_cpu.h | 25 +-- src/infiniop/ops/hardswish/cuda/kernel.cuh | 49 +---- .../ops/hardswish/nvidia/hardswish_nvidia.cu | 53 +---- src/infiniop/ops/hardswish/operator.cc | 134 +----------- test/infiniop/exp.py | 165 --------------- test/infiniop/hardswish.py | 167 --------------- test/infiniop/test_all_unary_ops.py | 46 ++++ 19 files changed, 337 insertions(+), 1113 deletions(-) delete mode 100644 include/infiniop/ops/exp.h delete mode 100644 include/infiniop/ops/hardswish.h delete mode 100644 test/infiniop/exp.py delete mode 100644 test/infiniop/hardswish.py diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h deleted file mode 100644 index 624bc5363..000000000 --- a/include/infiniop/ops/exp.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __INFINIOP_EXP_API_H__ -#define __INFINIOP_EXP_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; - -__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, - infiniopExpDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); - -__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); - -__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h deleted file mode 100644 index 8d655fe82..000000000 --- a/include/infiniop/ops/hardswish.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __INFINIOP_HARDSWISH_API_H__ -#define __INFINIOP_HARDSWISH_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; - -__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, - infiniopHardswishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); - -__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); - -__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); - -#endif diff --git a/include/infiniop/ops/unary_ops_api.h b/include/infiniop/ops/unary_ops_api.h index 95b0773b6..11cae2190 100644 --- a/include/infiniop/ops/unary_ops_api.h +++ b/include/infiniop/ops/unary_ops_api.h @@ -35,5 +35,7 @@ UNARY_OP_API_DECLARE(erf, Erf) UNARY_OP_API_DECLARE(atan, Atan) UNARY_OP_API_DECLARE(acos, Acos) UNARY_OP_API_DECLARE(ceil, Ceil) +UNARY_OP_API_DECLARE(exp, Exp) +UNARY_OP_API_DECLARE(hardswish, Hardswish) #endif // 
__INFINIOP_UNARY_OPS_API_H__ diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h index 030f4d87e..fff5b1819 100644 --- a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h +++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h @@ -25,8 +25,74 @@ * } */ +// ========================================================================= +// Internal Helpers (Private Macros to reduce duplication) +// ========================================================================= + +/** + * @brief Common Calculate Switch Cases (F16 & F32) + */ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate(_info, output, inputs, stream); + /** - * @brief Macro to generate binary operator implementation. + * @brief Extended Calculate Switch Cases (Adds F64 & BF16) + */ +#define _IMPL_CALC_CASES_EXTENDED \ + _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F64: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->template calculate(_info, output, inputs, stream); + +/** + * @brief Generic Template for the Calculate method + * @param CASES_MACRO The macro containing the switch cases to use + */ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Generic Template for the Create method + * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking + * @param ... Variadic arguments for allowed data types in CHECK_DTYPE + */ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } + +// ========================================================================= +// Public API Implementation Macros +// ========================================================================= + +/** + * @brief Implementation for Binary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for binary operators, using the generic implementation. 
@@ -37,48 +103,19 @@ * ELEMENTWISE_CPU_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &out_shape = out_desc->shape(); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); \ - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** - * @brief Macro to generate unary operator implementation. + * @brief Implementation for Unary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for unary operators, using the generic implementation. 
@@ -89,42 +126,34 @@ * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &y_shape = out_desc->shape(); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(y_shape, x_shape); \ - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->template calculate( \ - _info, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) + +/** + * @brief Implementation for Unary Operators Extended (F16, F32, F64, BF16) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators supporting F16, F32, F64, and BF16 data types. + * + * Usage: + * namespace op::exp::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp) + * } + */ +#define ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) #endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh index 39b78884a..b0716db1a 100644 --- a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -25,8 +25,81 @@ * } */ +// ========================================================================= +// Internal Helpers (Private Macros to reduce duplication) +// ========================================================================= + +/** + * @brief Common Calculate Switch Cases (F16 & F32) + */ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); + /** - * @brief Macro to generate binary operator implementation for NVIDIA/CUDA. 
+ * @brief Extended Calculate Switch Cases (Adds F64 & BF16) + * Note: Order is F16, BF16, F32, F64 to match original implementation + */ +#define _IMPL_CALC_CASES_EXTENDED \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::Op, double>(_info, workspace, output, inputs, stream); + +/** + * @brief Generic Template for the Calculate method + * @param CASES_MACRO The macro containing the switch cases to use + */ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Generic Template for the Create method + * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking + * @param ... Variadic arguments for allowed data types in CHECK_DTYPE + */ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } + +// ========================================================================= +// Public API Implementation Macros +// ========================================================================= + +/** + * @brief Implementation for Binary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for binary operators, using the generic implementation. 
@@ -36,51 +109,19 @@ * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &c_shape = out_desc->shape(); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); \ - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - if (workspace_size < _workspace_size) { \ - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ - } \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->calculate<256, cuda::Op, half>( \ - _info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->calculate<256, cuda::Op, float>( \ - _info, workspace, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** - * @brief Macro to generate unary operator implementation for NVIDIA/CUDA. + * @brief Implementation for Unary Operators (F16, F32) * * This macro generates the Descriptor destructor, create, and calculate methods * for unary operators, using the generic implementation. 
@@ -90,45 +131,33 @@ * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ - \ - Descriptor::~Descriptor() = default; \ - \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &y_shape = out_desc->shape(); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ - CHECK_SAME_SHAPE(y_shape, x_shape); \ - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ - } \ - \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - if (workspace_size < _workspace_size) { \ - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ - } \ - switch (_dtype) { \ - case INFINI_DTYPE_F16: \ - return _device_info->calculate<256, cuda::Op, half>( \ - _info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->calculate<256, cuda::Op, float>( \ - _info, workspace, output, inputs, stream); \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ - } +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) + +/** + * @brief Implementation for Unary Operators Extended (F16, F32, F64, BF16) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators supporting F16, F32, F64, and BF16 data types. + * + * Usage: + * namespace op::exp::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ + ) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) #endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h index 9f41dedb2..330f305dd 100644 --- a/src/infiniop/elementwise/unary.h +++ b/src/infiniop/elementwise/unary.h @@ -48,6 +48,7 @@ enum class UnaryMode { Sigmoid, Sign, Erf, + Hardswish, }; /** @@ -124,6 +125,17 @@ struct UnaryOp { return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); } else if constexpr (Mode == UnaryMode::Erf) { return std::erf(x); + } else if constexpr (Mode == UnaryMode::Hardswish) { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x_val = static_cast(x); + double y = x_val + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x_val * (y / 6.0); + return static_cast(out); + } } else { static_assert(Mode != Mode, "Unsupported unary operation mode"); return x; @@ -487,6 +499,41 @@ struct UnaryOp { } else { return std::erf(x); } + } else if constexpr (Mode == UnaryMode::Hardswish) { + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + auto hswish_f32 = [](float x) -> float { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + }; + if constexpr (std::is_same_v) { + float2 vf = __half22float2(x); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(x); + } else if constexpr (std::is_same_v) { + double xd = static_cast(x); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(x); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } } else { static_assert(Mode != Mode, "Unsupported unary operation mode"); return x; diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc index 58a6d0f2d..fb254884f 100644 --- a/src/infiniop/ops/exp/cpu/exp_cpu.cc +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -1,52 +1,8 @@ #include "exp_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::exp::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h index 867c7afa5..c247651a5 100644 --- a/src/infiniop/ops/exp/cpu/exp_cpu.h +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -2,20 +2,8 @@ #define __EXP_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" -#include +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(exp, cpu) - -namespace op::exp::cpu { -typedef struct ExpOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &input) const { - return std::exp(input); - } -} ExpOp; -} // namespace op::exp::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(exp, cpu, 
op::elementwise::unary::UnaryMode::Exp) #endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh index 12446f31a..8ef5375b8 100644 --- a/src/infiniop/ops/exp/cuda/kernel.cuh +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -1,39 +1,10 @@ #ifndef __EXP_CUDA_H__ #define __EXP_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/unary.h" namespace op::exp::cuda { -typedef struct ExpOp { - static constexpr size_t num_inputs = 1; - - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float inputf = __half2float(input); - return __float2half_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(__expf(f0), __expf(f1)); - } else if constexpr (std::is_same_v) { - float inputf = __bfloat162float(input); - return __float2bfloat16_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - return __expf(input); - } else if constexpr (std::is_same_v) { - return std::exp(input); - } else { - return std::exp(input); - } - } -} ExpOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::exp::cuda #endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu index 3bdf2eb45..532f3a0d7 100644 --- a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -1,59 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "exp_nvidia.cuh" namespace op::exp::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - 
- return INFINI_STATUS_SUCCESS; -} } // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc index cc369d660..9a7aaecc5 100644 --- a/src/infiniop/ops/exp/operator.cc +++ b/src/infiniop/ops/exp/operator.cc @@ -1,6 +1,5 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/exp.h" +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/exp_cpu.h" @@ -9,131 +8,4 @@ #include "nvidia/exp_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateExpDescriptor( - infiniopHandle_t handle, - infiniopExpDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::exp::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopExp( - infiniopExpDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, {input}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(exp, Exp) diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc index e7b68508a..0d5b8577c 100644 --- 
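The `UNARY_OP_IMPL(exp, Exp)` line above collapses the per-operator create / get-workspace / calculate / destroy boilerplate that the removed `operator.cc` spelled out by hand, presumably via a macro provided by the `../../operator_impl.h` header included at the top of the file (its definition is not part of this hunk). The sketch below is only an illustrative, self-contained analogue of that dispatch pattern, reconstructed from the removed switch statements; `Device`, `Status`, and `UNARY_OP_IMPL_SKETCH` are simplified placeholders, not the actual InfiniOP interfaces.

```cpp
// Illustrative analogue only: a self-contained sketch of macro-generated
// per-operator dispatch, modeled on the removed infiniopCreateExpDescriptor /
// infiniopExp switch statements above.
#include <cmath>
#include <cstdio>

enum Device { DEVICE_CPU, DEVICE_NVIDIA };
enum Status { STATUS_SUCCESS, STATUS_DEVICE_TYPE_NOT_SUPPORTED };

namespace op::exp::cpu    { inline Status calculate(float x, float *y) { *y = std::exp(x); return STATUS_SUCCESS; } }
namespace op::exp::nvidia { inline Status calculate(float x, float *y) { *y = std::exp(x); return STATUS_SUCCESS; } }

// The real UNARY_OP_IMPL presumably emits the full create/get-workspace/
// calculate/destroy quartet; this sketch emits a single function to show the
// device switch that such a macro centralizes.
#define UNARY_OP_IMPL_SKETCH(name)                                        \
    Status infiniop_##name(Device dev, float in, float *out) {            \
        switch (dev) {                                                     \
        case DEVICE_CPU:    return op::name::cpu::calculate(in, out);     \
        case DEVICE_NVIDIA: return op::name::nvidia::calculate(in, out);  \
        default:            return STATUS_DEVICE_TYPE_NOT_SUPPORTED;      \
        }                                                                  \
    }

UNARY_OP_IMPL_SKETCH(exp)

int main() {
    float y = 0.0f;
    Status s = infiniop_exp(DEVICE_CPU, 1.0f, &y);
    std::printf("status=%d y=%f\n", static_cast<int>(s), y);
    return 0;
}
```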
a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -1,52 +1,8 @@ #include "hardswish_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::hardswish::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(hardswish) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h index e137be8a0..de1a78f65 100644 --- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -2,29 +2,8 @@ #define __HARDSWISH_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" -#include +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(hardswish, cpu) - -namespace op::hardswish::cpu { -typedef struct HardswishOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &input) const { - if constexpr (std::is_integral_v) { - return static_cast(0); - } else { - // x * clamp(x + 3, 0, 6) / 6 - auto x = static_cast(input); - double y = x + 3.0; - y = std::min(std::max(y, 0.0), 6.0); - double out = x * (y / 6.0); - return static_cast(out); - } - } -} HardswishOp; -} // namespace op::hardswish::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(hardswish, cpu, op::elementwise::unary::UnaryMode::Hardswish) #endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh index d5b369bce..95377b75c 100644 --- a/src/infiniop/ops/hardswish/cuda/kernel.cuh +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -1,55 +1,10 @@ #ifndef __HARDSWISH_CUDA_H__ #define __HARDSWISH_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/unary.h" namespace op::hardswish::cuda { - -typedef struct HardswishOp { - static constexpr size_t num_inputs = 1; - - // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 - __device__ __forceinline__ float hswish_f32(float x) const { - float y = x + 3.0f; - y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); - return x * (y * (1.0f / 6.0f)); - } - - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2( - hswish_f32(vf.x), - hswish_f32(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = hswish_f32(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - return __float2bfloat16_rz(hswish_f32(xf)); - } else if constexpr (std::is_same_v) { - return hswish_f32(input); - } else if constexpr (std::is_same_v) { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); - } else { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); - } - } -} HardswishOp; - +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::hardswish::cuda #endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu index 9e279c2ef..ebf5250f7 100644 --- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -1,59 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "hardswish_nvidia.cuh" namespace op::hardswish::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(hardswish) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &input_desc = input_desc_vec.at(0); - const auto &output_shape = out_desc->shape(); - const auto &input_shape = input_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); - - CHECK_SAME_SHAPE(output_shape, input_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F64: - return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::hardswish::nvidia diff --git 
a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc index c51b18777..4cf68d328 100644 --- a/src/infiniop/ops/hardswish/operator.cc +++ b/src/infiniop/ops/hardswish/operator.cc @@ -1,6 +1,5 @@ -#include "../../operator.h" -#include "../../handle.h" -#include "infiniop/ops/hardswish.h" +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" #ifdef ENABLE_CPU_API #include "cpu/hardswish_cpu.h" @@ -9,131 +8,4 @@ #include "nvidia/hardswish_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateHardswishDescriptor( - infiniopHandle_t handle, - infiniopHardswishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output_desc, - infiniopTensorDescriptor_t input_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::hardswish::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopHardswish( - infiniopHardswishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, output, {input}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(hardswish, Hardswish) diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py deleted file mode 100644 index eb139af12..000000000 --- a/test/infiniop/exp.py +++ /dev/null @@ -1,165 
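For reference, both hardswish functors removed above compute f(x) = x * clamp(x + 3, 0, 6) / 6, which is also the formula the deleted `hardswish.py` test compares against. The following is a minimal standalone sketch of that formula at a few easy-to-verify points, independent of the library and using only the C++ standard library; `hardswish` here is a local helper, not an InfiniOP symbol.

```cpp
// Standalone sanity check of the hardswish formula used by the removed
// functors and the deleted Python test: f(x) = x * clamp(x + 3, 0, 6) / 6.
#include <algorithm>
#include <cassert>
#include <cstdio>

static double hardswish(double x) {
    double y = std::min(std::max(x + 3.0, 0.0), 6.0); // clamp(x + 3, 0, 6)
    return x * (y / 6.0);
}

int main() {
    // Below -3 the clamp saturates at 0, above +3 it saturates at 6,
    // so the function is 0 on the left tail and the identity on the right tail.
    assert(hardswish(-4.0) == 0.0);
    assert(hardswish(-3.0) == 0.0);
    assert(hardswish(0.0) == 0.0);
    assert(hardswish(3.0) == 3.0);
    assert(hardswish(4.0) == 4.0);
    std::printf("hardswish(1.0) = %f\n", hardswish(1.0)); // 1 * (4/6) = 0.666...
    return 0;
}
```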
+0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - get_sync_func, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ======================================================================== -# Configuration (Internal Use Only) -# ======================================================================== -_TEST_CASES_ = [ - # shape, input_stride, output_stride - ((13, 4), None, None), - ((13, 4), (10, 1), (10, 1)), - ((13, 4), (0, 1), None), - ((13, 4, 4), None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), None), - ((16, 5632), None, None), - ((16, 5632), (10240, 1), (10240, 1)), - ((4, 4, 5632), None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), -] - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_INPUT = auto() - -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_INPUT, -] - -_TEST_CASES = [ - test_case + (inplace,) - for test_case in _TEST_CASES_ - for inplace in _INPLACE -] - -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] - -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, - InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def exp(output, input): - output.copy_(torch.exp(input)) - -def test( - handle, - device, - shape, - input_stride=None, - output_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=torch.float16, - sync=None, -): - input = TestTensor(shape, input_stride, dtype, device) - if inplace == Inplace.INPLACE_INPUT: - if input_stride != output_stride: - return - output = input - else: - output = TestTensor(shape, output_stride, dtype, device, mode="ones") - - if output.is_broadcast(): - return - - print( - f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - exp(output.torch_tensor(), input.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateExpDescriptor( - handle, - ctypes.byref(descriptor), - output.descriptor, - input.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [input, output]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetExpWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, output.device) - - def lib_exp(): - check_error( - LIBINFINIOP.infiniopExp( - descriptor, - workspace.data(), - workspace_size.value, - output.data(), - input.data(), - None, - ) - ) - - lib_exp() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, 
NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py deleted file mode 100644 index 424b30567..000000000 --- a/test/infiniop/hardswish.py +++ /dev/null @@ -1,167 +0,0 @@ -import torch -import ctypes -from ctypes import c_uint64 -from libinfiniop import ( - LIBINFINIOP, - TestTensor, - get_test_devices, - check_error, - test_operator, - get_args, - debug, - get_tolerance, - get_sync_func, - profile_operation, - TestWorkspace, - InfiniDtype, - InfiniDtypeNames, - InfiniDeviceNames, - infiniopOperatorDescriptor_t, -) -from enum import Enum, auto - -# ======================================================================== -# Configuration (Internal Use Only) -# ======================================================================== -_TEST_CASES_ = [ - # shape, input_stride, output_stride - ((13, 4), None, None), - ((13, 4), (10, 1), (10, 1)), - ((13, 4), (0, 1), None), - ((13, 4, 4), None, None), - ((13, 4, 4), (20, 4, 1), (20, 4, 1)), - ((13, 4, 4), (4, 0, 1), None), - ((16, 5632), None, None), - ((16, 5632), (10240, 1), (10240, 1)), - ((4, 4, 5632), None, None), - ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), -] - -class Inplace(Enum): - OUT_OF_PLACE = auto() - INPLACE_INPUT = auto() - -_INPLACE = [ - Inplace.OUT_OF_PLACE, - Inplace.INPLACE_INPUT, -] - -_TEST_CASES = [ - test_case + (inplace,) - for test_case in _TEST_CASES_ - for inplace in _INPLACE -] - -_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] - -_TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, - InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, - InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, -} - -DEBUG = False -PROFILE = False -NUM_PRERUN = 10 -NUM_ITERATIONS = 1000 - - -def hardswish(output, input): - output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) - - -def test( - handle, - device, - shape, - input_stride=None, - output_stride=None, - inplace=Inplace.OUT_OF_PLACE, - dtype=torch.float16, - sync=None, -): - input = TestTensor(shape, input_stride, dtype, device) - if inplace == Inplace.INPLACE_INPUT: - if input_stride != output_stride: - return - output = input - else: - output = TestTensor(shape, output_stride, dtype, device, mode="ones") - - if output.is_broadcast(): - return - - print( - f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " - f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" - ) - - hardswish(output.torch_tensor(), input.torch_tensor()) - - if sync is not None: - sync() - - descriptor = infiniopOperatorDescriptor_t() - check_error( - LIBINFINIOP.infiniopCreateHardswishDescriptor( - handle, - ctypes.byref(descriptor), - output.descriptor, - input.descriptor, - ) - ) - - # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel - for tensor in [input, output]: - tensor.destroy_desc() - - workspace_size = c_uint64(0) - check_error( - LIBINFINIOP.infiniopGetHardswishWorkspaceSize( - descriptor, ctypes.byref(workspace_size) - ) - ) - workspace = TestWorkspace(workspace_size.value, 
output.device) - - def lib_hardswish(): - check_error( - LIBINFINIOP.infiniopHardswish( - descriptor, - workspace.data(), - workspace_size.value, - output.data(), - input.data(), - None, - ) - ) - - lib_hardswish() - - atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) - if DEBUG: - debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) - - # Profiling workflow - if PROFILE: - # fmt: off - profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) - profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) - # fmt: on - check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) - - -if __name__ == "__main__": - args = get_args() - - # Configure testing options - DEBUG = args.debug - PROFILE = args.profile - NUM_PRERUN = args.num_prerun - NUM_ITERATIONS = args.num_iterations - - for device in get_test_devices(args): - test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - - print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/test_all_unary_ops.py b/test/infiniop/test_all_unary_ops.py index b9d7cdc8b..2a65cf938 100644 --- a/test/infiniop/test_all_unary_ops.py +++ b/test/infiniop/test_all_unary_ops.py @@ -430,6 +430,50 @@ def generate_input(shape, dtype, device): EQUAL_NAN = True +class ExpTest(UnaryTestBase): + OP_NAME = "Exp" + OP_NAME_LOWER = "exp" + + @staticmethod + def torch_op(x): + return torch.exp(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + +class HardswishTest(UnaryTestBase): + OP_NAME = "Hardswish" + OP_NAME_LOWER = "hardswish" + + @staticmethod + def torch_op(x): + return (x * torch.clamp(x + 3, min=0, max=6) / 6).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 2 - 1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + # ============================================================================== # 算子注册表 # ============================================================================== @@ -456,6 +500,8 @@ def generate_input(shape, dtype, device): "sinh": SinhTest, "sqrt": SqrtTest, "tan": TanTest, + "exp": ExpTest, + "hardswish": HardswishTest, } From 6d475877b8766c62132fd492c462a7b8c1314802 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Wed, 21 Jan 2026 05:26:22 +0000 Subject: [PATCH 7/7] Issue/888 - Add gt,lt,ge,le,eq,ne,logical_and,logical_or,logical_xor,sin,bitwise_and, bitwise_or, bitwise_xor, bitwise_left_shift, bitwise_right_shift,floor_divide,atan2,exp2,log2,log10,rsqrt,square,hypot,copysign,remainder,isnan,isfinite,isinf,sinc,fmin,fmax,log1p binary operators. 
--- include/infiniop.h | 8 + include/infiniop/ops/averagepool.h | 8 +- include/infiniop/ops/binary_ops_api.h | 21 + include/infiniop/ops/cross_entropy_loss.h | 8 +- include/infiniop/ops/interpolate_nearest.h | 8 +- include/infiniop/ops/maxpool.h | 8 +- include/infiniop/ops/unary_ops_api.h | 11 + src/infiniop/README.md | 208 ++++ src/infiniop/elementwise/binary.h | 544 +++++++- .../elementwise/cpu/elementwise_cpu_impl.h | 141 ++- .../nvidia/elementwise_nvidia_impl.cuh | 150 ++- src/infiniop/elementwise/unary.h | 354 +++++- src/infiniop/ops/atan2/cpu/atan2_cpu.cc | 8 + src/infiniop/ops/atan2/cpu/atan2_cpu.h | 9 + src/infiniop/ops/atan2/cuda/kernel.cuh | 10 + src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu | 10 + .../ops/atan2/nvidia/atan2_nvidia.cuh | 8 + src/infiniop/ops/atan2/operator.cc | 11 + .../ops/bitwise_and/cpu/bitwise_and_cpu.cc | 8 + .../ops/bitwise_and/cpu/bitwise_and_cpu.h | 9 + src/infiniop/ops/bitwise_and/cuda/kernel.cuh | 10 + .../bitwise_and/nvidia/bitwise_and_nvidia.cu | 10 + .../bitwise_and/nvidia/bitwise_and_nvidia.cuh | 8 + src/infiniop/ops/bitwise_and/operator.cc | 11 + .../cpu/bitwise_left_shift_cpu.cc | 8 + .../cpu/bitwise_left_shift_cpu.h | 9 + .../ops/bitwise_left_shift/cuda/kernel.cuh | 10 + .../nvidia/bitwise_left_shift_nvidia.cu | 10 + .../nvidia/bitwise_left_shift_nvidia.cuh | 8 + .../ops/bitwise_left_shift/operator.cc | 11 + .../ops/bitwise_or/cpu/bitwise_or_cpu.cc | 8 + .../ops/bitwise_or/cpu/bitwise_or_cpu.h | 9 + src/infiniop/ops/bitwise_or/cuda/kernel.cuh | 10 + .../bitwise_or/nvidia/bitwise_or_nvidia.cu | 10 + .../bitwise_or/nvidia/bitwise_or_nvidia.cuh | 8 + src/infiniop/ops/bitwise_or/operator.cc | 11 + .../cpu/bitwise_right_shift_cpu.cc | 8 + .../cpu/bitwise_right_shift_cpu.h | 9 + .../ops/bitwise_right_shift/cuda/kernel.cuh | 10 + .../nvidia/bitwise_right_shift_nvidia.cu | 10 + .../nvidia/bitwise_right_shift_nvidia.cuh | 8 + .../ops/bitwise_right_shift/operator.cc | 11 + .../ops/bitwise_xor/cpu/bitwise_xor_cpu.cc | 8 + .../ops/bitwise_xor/cpu/bitwise_xor_cpu.h | 9 + src/infiniop/ops/bitwise_xor/cuda/kernel.cuh | 10 + .../bitwise_xor/nvidia/bitwise_xor_nvidia.cu | 10 + .../bitwise_xor/nvidia/bitwise_xor_nvidia.cuh | 8 + src/infiniop/ops/bitwise_xor/operator.cc | 11 + src/infiniop/ops/copysign/cpu/copysign_cpu.cc | 8 + src/infiniop/ops/copysign/cpu/copysign_cpu.h | 9 + src/infiniop/ops/copysign/cuda/kernel.cuh | 10 + .../ops/copysign/nvidia/copysign_nvidia.cu | 10 + .../ops/copysign/nvidia/copysign_nvidia.cuh | 8 + src/infiniop/ops/copysign/operator.cc | 11 + src/infiniop/ops/eq/cpu/eq_cpu.cc | 8 + src/infiniop/ops/eq/cpu/eq_cpu.h | 9 + src/infiniop/ops/eq/cuda/kernel.cuh | 10 + src/infiniop/ops/eq/nvidia/eq_nvidia.cu | 10 + src/infiniop/ops/eq/nvidia/eq_nvidia.cuh | 8 + src/infiniop/ops/eq/operator.cc | 11 + src/infiniop/ops/exp2/cpu/exp2_cpu.cc | 8 + src/infiniop/ops/exp2/cpu/exp2_cpu.h | 9 + src/infiniop/ops/exp2/cuda/kernel.cuh | 10 + src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu | 10 + src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh | 8 + src/infiniop/ops/exp2/operator.cc | 11 + .../ops/floor_divide/cpu/floor_divide_cpu.cc | 8 + .../ops/floor_divide/cpu/floor_divide_cpu.h | 9 + src/infiniop/ops/floor_divide/cuda/kernel.cuh | 10 + .../nvidia/floor_divide_nvidia.cu | 10 + .../nvidia/floor_divide_nvidia.cuh | 8 + src/infiniop/ops/floor_divide/operator.cc | 11 + src/infiniop/ops/fmax/cpu/fmax_cpu.cc | 8 + src/infiniop/ops/fmax/cpu/fmax_cpu.h | 9 + src/infiniop/ops/fmax/cuda/kernel.cuh | 10 + src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu | 10 + 
src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh | 8 + src/infiniop/ops/fmax/operator.cc | 11 + src/infiniop/ops/fmin/cpu/fmin_cpu.cc | 8 + src/infiniop/ops/fmin/cpu/fmin_cpu.h | 9 + src/infiniop/ops/fmin/cuda/kernel.cuh | 10 + src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu | 10 + src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh | 8 + src/infiniop/ops/fmin/operator.cc | 11 + src/infiniop/ops/ge/cpu/ge_cpu.cc | 8 + src/infiniop/ops/ge/cpu/ge_cpu.h | 9 + src/infiniop/ops/ge/cuda/kernel.cuh | 10 + src/infiniop/ops/ge/nvidia/ge_nvidia.cu | 10 + src/infiniop/ops/ge/nvidia/ge_nvidia.cuh | 8 + src/infiniop/ops/ge/operator.cc | 11 + src/infiniop/ops/gt/cpu/gt_cpu.cc | 8 + src/infiniop/ops/gt/cpu/gt_cpu.h | 9 + src/infiniop/ops/gt/cuda/kernel.cuh | 10 + src/infiniop/ops/gt/nvidia/gt_nvidia.cu | 10 + src/infiniop/ops/gt/nvidia/gt_nvidia.cuh | 8 + src/infiniop/ops/gt/operator.cc | 11 + src/infiniop/ops/hypot/cpu/hypot_cpu.cc | 8 + src/infiniop/ops/hypot/cpu/hypot_cpu.h | 9 + src/infiniop/ops/hypot/cuda/kernel.cuh | 10 + src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu | 10 + .../ops/hypot/nvidia/hypot_nvidia.cuh | 8 + src/infiniop/ops/hypot/operator.cc | 11 + src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc | 8 + src/infiniop/ops/isfinite/cpu/isfinite_cpu.h | 9 + src/infiniop/ops/isfinite/cuda/kernel.cuh | 10 + .../ops/isfinite/nvidia/isfinite_nvidia.cu | 10 + .../ops/isfinite/nvidia/isfinite_nvidia.cuh | 8 + src/infiniop/ops/isfinite/operator.cc | 11 + src/infiniop/ops/isinf/cpu/isinf_cpu.cc | 8 + src/infiniop/ops/isinf/cpu/isinf_cpu.h | 9 + src/infiniop/ops/isinf/cuda/kernel.cuh | 10 + src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu | 10 + .../ops/isinf/nvidia/isinf_nvidia.cuh | 8 + src/infiniop/ops/isinf/operator.cc | 11 + src/infiniop/ops/isnan/cpu/isnan_cpu.cc | 8 + src/infiniop/ops/isnan/cpu/isnan_cpu.h | 9 + src/infiniop/ops/isnan/cuda/kernel.cuh | 10 + src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu | 10 + .../ops/isnan/nvidia/isnan_nvidia.cuh | 8 + src/infiniop/ops/isnan/operator.cc | 11 + src/infiniop/ops/le/cpu/le_cpu.cc | 8 + src/infiniop/ops/le/cpu/le_cpu.h | 9 + src/infiniop/ops/le/cuda/kernel.cuh | 10 + src/infiniop/ops/le/nvidia/le_nvidia.cu | 10 + src/infiniop/ops/le/nvidia/le_nvidia.cuh | 8 + src/infiniop/ops/le/operator.cc | 11 + src/infiniop/ops/log10/cpu/log10_cpu.cc | 8 + src/infiniop/ops/log10/cpu/log10_cpu.h | 9 + src/infiniop/ops/log10/cuda/kernel.cuh | 10 + src/infiniop/ops/log10/nvidia/log10_nvidia.cu | 10 + .../ops/log10/nvidia/log10_nvidia.cuh | 8 + src/infiniop/ops/log10/operator.cc | 11 + src/infiniop/ops/log1p/cpu/log1p_cpu.cc | 8 + src/infiniop/ops/log1p/cpu/log1p_cpu.h | 9 + src/infiniop/ops/log1p/cuda/kernel.cuh | 10 + src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu | 10 + .../ops/log1p/nvidia/log1p_nvidia.cuh | 8 + src/infiniop/ops/log1p/operator.cc | 11 + src/infiniop/ops/log2/cpu/log2_cpu.cc | 8 + src/infiniop/ops/log2/cpu/log2_cpu.h | 9 + src/infiniop/ops/log2/cuda/kernel.cuh | 10 + src/infiniop/ops/log2/nvidia/log2_nvidia.cu | 10 + src/infiniop/ops/log2/nvidia/log2_nvidia.cuh | 8 + src/infiniop/ops/log2/operator.cc | 11 + .../ops/logical_and/cpu/logical_and_cpu.cc | 8 + .../ops/logical_and/cpu/logical_and_cpu.h | 9 + src/infiniop/ops/logical_and/cuda/kernel.cuh | 10 + .../logical_and/nvidia/logical_and_nvidia.cu | 10 + .../logical_and/nvidia/logical_and_nvidia.cuh | 8 + src/infiniop/ops/logical_and/operator.cc | 11 + .../ops/logical_or/cpu/logical_or_cpu.cc | 8 + .../ops/logical_or/cpu/logical_or_cpu.h | 9 + src/infiniop/ops/logical_or/cuda/kernel.cuh | 10 + 
.../logical_or/nvidia/logical_or_nvidia.cu | 10 + .../logical_or/nvidia/logical_or_nvidia.cuh | 8 + src/infiniop/ops/logical_or/operator.cc | 11 + .../ops/logical_xor/cpu/logical_xor_cpu.cc | 8 + .../ops/logical_xor/cpu/logical_xor_cpu.h | 9 + src/infiniop/ops/logical_xor/cuda/kernel.cuh | 10 + .../logical_xor/nvidia/logical_xor_nvidia.cu | 10 + .../logical_xor/nvidia/logical_xor_nvidia.cuh | 8 + src/infiniop/ops/logical_xor/operator.cc | 11 + src/infiniop/ops/lt/cpu/lt_cpu.cc | 8 + src/infiniop/ops/lt/cpu/lt_cpu.h | 9 + src/infiniop/ops/lt/cuda/kernel.cuh | 10 + src/infiniop/ops/lt/nvidia/lt_nvidia.cu | 10 + src/infiniop/ops/lt/nvidia/lt_nvidia.cuh | 8 + src/infiniop/ops/lt/operator.cc | 11 + src/infiniop/ops/ne/cpu/ne_cpu.cc | 8 + src/infiniop/ops/ne/cpu/ne_cpu.h | 9 + src/infiniop/ops/ne/cuda/kernel.cuh | 10 + src/infiniop/ops/ne/nvidia/ne_nvidia.cu | 10 + src/infiniop/ops/ne/nvidia/ne_nvidia.cuh | 8 + src/infiniop/ops/ne/operator.cc | 11 + .../ops/remainder/cpu/remainder_cpu.cc | 8 + .../ops/remainder/cpu/remainder_cpu.h | 9 + src/infiniop/ops/remainder/cuda/kernel.cuh | 10 + .../ops/remainder/nvidia/remainder_nvidia.cu | 10 + .../ops/remainder/nvidia/remainder_nvidia.cuh | 8 + src/infiniop/ops/remainder/operator.cc | 11 + src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc | 8 + src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h | 9 + src/infiniop/ops/rsqrt/cuda/kernel.cuh | 10 + src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu | 10 + .../ops/rsqrt/nvidia/rsqrt_nvidia.cuh | 8 + src/infiniop/ops/rsqrt/operator.cc | 11 + src/infiniop/ops/sin/cpu/sin_cpu.cc | 8 + src/infiniop/ops/sin/cpu/sin_cpu.h | 9 + src/infiniop/ops/sin/cuda/kernel.cuh | 10 + src/infiniop/ops/sin/nvidia/sin_nvidia.cu | 10 + src/infiniop/ops/sin/nvidia/sin_nvidia.cuh | 8 + src/infiniop/ops/sin/operator.cc | 11 + src/infiniop/ops/sinc/cpu/sinc_cpu.cc | 8 + src/infiniop/ops/sinc/cpu/sinc_cpu.h | 9 + src/infiniop/ops/sinc/cuda/kernel.cuh | 10 + src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu | 10 + src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh | 8 + src/infiniop/ops/sinc/operator.cc | 11 + src/infiniop/ops/square/cpu/square_cpu.cc | 8 + src/infiniop/ops/square/cpu/square_cpu.h | 9 + src/infiniop/ops/square/cuda/kernel.cuh | 10 + .../ops/square/nvidia/square_nvidia.cu | 10 + .../ops/square/nvidia/square_nvidia.cuh | 8 + src/infiniop/ops/square/operator.cc | 11 + test/infiniop/libinfiniop/op_register.py | 1091 ++++++++++++++++- test/infiniop/libinfiniop/utils.py | 42 +- test/infiniop/test_all_binary_ops.py | 606 +++++++++ test/infiniop/test_all_unary_ops.py | 279 +++++ 208 files changed, 5093 insertions(+), 186 deletions(-) create mode 100644 src/infiniop/ops/atan2/cpu/atan2_cpu.cc create mode 100644 src/infiniop/ops/atan2/cpu/atan2_cpu.h create mode 100644 src/infiniop/ops/atan2/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu create mode 100644 src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh create mode 100644 src/infiniop/ops/atan2/operator.cc create mode 100644 src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc create mode 100644 src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h create mode 100644 src/infiniop/ops/bitwise_and/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_and/operator.cc create mode 100644 src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc create mode 100644 src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h 
create mode 100644 src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_left_shift/operator.cc create mode 100644 src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc create mode 100644 src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h create mode 100644 src/infiniop/ops/bitwise_or/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_or/operator.cc create mode 100644 src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc create mode 100644 src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h create mode 100644 src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_right_shift/operator.cc create mode 100644 src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc create mode 100644 src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h create mode 100644 src/infiniop/ops/bitwise_xor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu create mode 100644 src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh create mode 100644 src/infiniop/ops/bitwise_xor/operator.cc create mode 100644 src/infiniop/ops/copysign/cpu/copysign_cpu.cc create mode 100644 src/infiniop/ops/copysign/cpu/copysign_cpu.h create mode 100644 src/infiniop/ops/copysign/cuda/kernel.cuh create mode 100644 src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu create mode 100644 src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh create mode 100644 src/infiniop/ops/copysign/operator.cc create mode 100644 src/infiniop/ops/eq/cpu/eq_cpu.cc create mode 100644 src/infiniop/ops/eq/cpu/eq_cpu.h create mode 100644 src/infiniop/ops/eq/cuda/kernel.cuh create mode 100644 src/infiniop/ops/eq/nvidia/eq_nvidia.cu create mode 100644 src/infiniop/ops/eq/nvidia/eq_nvidia.cuh create mode 100644 src/infiniop/ops/eq/operator.cc create mode 100644 src/infiniop/ops/exp2/cpu/exp2_cpu.cc create mode 100644 src/infiniop/ops/exp2/cpu/exp2_cpu.h create mode 100644 src/infiniop/ops/exp2/cuda/kernel.cuh create mode 100644 src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu create mode 100644 src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh create mode 100644 src/infiniop/ops/exp2/operator.cc create mode 100644 src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc create mode 100644 src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h create mode 100644 src/infiniop/ops/floor_divide/cuda/kernel.cuh create mode 100644 src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu create mode 100644 src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh create mode 100644 src/infiniop/ops/floor_divide/operator.cc create mode 100644 src/infiniop/ops/fmax/cpu/fmax_cpu.cc create mode 100644 src/infiniop/ops/fmax/cpu/fmax_cpu.h create mode 100644 src/infiniop/ops/fmax/cuda/kernel.cuh create mode 100644 src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu create mode 100644 src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh create mode 100644 src/infiniop/ops/fmax/operator.cc create mode 100644 
src/infiniop/ops/fmin/cpu/fmin_cpu.cc create mode 100644 src/infiniop/ops/fmin/cpu/fmin_cpu.h create mode 100644 src/infiniop/ops/fmin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu create mode 100644 src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh create mode 100644 src/infiniop/ops/fmin/operator.cc create mode 100644 src/infiniop/ops/ge/cpu/ge_cpu.cc create mode 100644 src/infiniop/ops/ge/cpu/ge_cpu.h create mode 100644 src/infiniop/ops/ge/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ge/nvidia/ge_nvidia.cu create mode 100644 src/infiniop/ops/ge/nvidia/ge_nvidia.cuh create mode 100644 src/infiniop/ops/ge/operator.cc create mode 100644 src/infiniop/ops/gt/cpu/gt_cpu.cc create mode 100644 src/infiniop/ops/gt/cpu/gt_cpu.h create mode 100644 src/infiniop/ops/gt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/gt/nvidia/gt_nvidia.cu create mode 100644 src/infiniop/ops/gt/nvidia/gt_nvidia.cuh create mode 100644 src/infiniop/ops/gt/operator.cc create mode 100644 src/infiniop/ops/hypot/cpu/hypot_cpu.cc create mode 100644 src/infiniop/ops/hypot/cpu/hypot_cpu.h create mode 100644 src/infiniop/ops/hypot/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu create mode 100644 src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh create mode 100644 src/infiniop/ops/hypot/operator.cc create mode 100644 src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc create mode 100644 src/infiniop/ops/isfinite/cpu/isfinite_cpu.h create mode 100644 src/infiniop/ops/isfinite/cuda/kernel.cuh create mode 100644 src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu create mode 100644 src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh create mode 100644 src/infiniop/ops/isfinite/operator.cc create mode 100644 src/infiniop/ops/isinf/cpu/isinf_cpu.cc create mode 100644 src/infiniop/ops/isinf/cpu/isinf_cpu.h create mode 100644 src/infiniop/ops/isinf/cuda/kernel.cuh create mode 100644 src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu create mode 100644 src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh create mode 100644 src/infiniop/ops/isinf/operator.cc create mode 100644 src/infiniop/ops/isnan/cpu/isnan_cpu.cc create mode 100644 src/infiniop/ops/isnan/cpu/isnan_cpu.h create mode 100644 src/infiniop/ops/isnan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu create mode 100644 src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh create mode 100644 src/infiniop/ops/isnan/operator.cc create mode 100644 src/infiniop/ops/le/cpu/le_cpu.cc create mode 100644 src/infiniop/ops/le/cpu/le_cpu.h create mode 100644 src/infiniop/ops/le/cuda/kernel.cuh create mode 100644 src/infiniop/ops/le/nvidia/le_nvidia.cu create mode 100644 src/infiniop/ops/le/nvidia/le_nvidia.cuh create mode 100644 src/infiniop/ops/le/operator.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.h create mode 100644 src/infiniop/ops/log10/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cu create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cuh create mode 100644 src/infiniop/ops/log10/operator.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.h create mode 100644 src/infiniop/ops/log1p/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh create mode 100644 src/infiniop/ops/log1p/operator.cc create mode 100644 
src/infiniop/ops/log2/cpu/log2_cpu.cc create mode 100644 src/infiniop/ops/log2/cpu/log2_cpu.h create mode 100644 src/infiniop/ops/log2/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log2/nvidia/log2_nvidia.cu create mode 100644 src/infiniop/ops/log2/nvidia/log2_nvidia.cuh create mode 100644 src/infiniop/ops/log2/operator.cc create mode 100644 src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc create mode 100644 src/infiniop/ops/logical_and/cpu/logical_and_cpu.h create mode 100644 src/infiniop/ops/logical_and/cuda/kernel.cuh create mode 100644 src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu create mode 100644 src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh create mode 100644 src/infiniop/ops/logical_and/operator.cc create mode 100644 src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc create mode 100644 src/infiniop/ops/logical_or/cpu/logical_or_cpu.h create mode 100644 src/infiniop/ops/logical_or/cuda/kernel.cuh create mode 100644 src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu create mode 100644 src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh create mode 100644 src/infiniop/ops/logical_or/operator.cc create mode 100644 src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc create mode 100644 src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h create mode 100644 src/infiniop/ops/logical_xor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu create mode 100644 src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh create mode 100644 src/infiniop/ops/logical_xor/operator.cc create mode 100644 src/infiniop/ops/lt/cpu/lt_cpu.cc create mode 100644 src/infiniop/ops/lt/cpu/lt_cpu.h create mode 100644 src/infiniop/ops/lt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/lt/nvidia/lt_nvidia.cu create mode 100644 src/infiniop/ops/lt/nvidia/lt_nvidia.cuh create mode 100644 src/infiniop/ops/lt/operator.cc create mode 100644 src/infiniop/ops/ne/cpu/ne_cpu.cc create mode 100644 src/infiniop/ops/ne/cpu/ne_cpu.h create mode 100644 src/infiniop/ops/ne/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ne/nvidia/ne_nvidia.cu create mode 100644 src/infiniop/ops/ne/nvidia/ne_nvidia.cuh create mode 100644 src/infiniop/ops/ne/operator.cc create mode 100644 src/infiniop/ops/remainder/cpu/remainder_cpu.cc create mode 100644 src/infiniop/ops/remainder/cpu/remainder_cpu.h create mode 100644 src/infiniop/ops/remainder/cuda/kernel.cuh create mode 100644 src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu create mode 100644 src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh create mode 100644 src/infiniop/ops/remainder/operator.cc create mode 100644 src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc create mode 100644 src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h create mode 100644 src/infiniop/ops/rsqrt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu create mode 100644 src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh create mode 100644 src/infiniop/ops/rsqrt/operator.cc create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.cc create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.h create mode 100644 src/infiniop/ops/sin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cu create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cuh create mode 100644 src/infiniop/ops/sin/operator.cc create mode 100644 src/infiniop/ops/sinc/cpu/sinc_cpu.cc create mode 100644 src/infiniop/ops/sinc/cpu/sinc_cpu.h create mode 100644 src/infiniop/ops/sinc/cuda/kernel.cuh create mode 100644 
src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu create mode 100644 src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh create mode 100644 src/infiniop/ops/sinc/operator.cc create mode 100644 src/infiniop/ops/square/cpu/square_cpu.cc create mode 100644 src/infiniop/ops/square/cpu/square_cpu.h create mode 100644 src/infiniop/ops/square/cuda/kernel.cuh create mode 100644 src/infiniop/ops/square/nvidia/square_nvidia.cu create mode 100644 src/infiniop/ops/square/nvidia/square_nvidia.cuh create mode 100644 src/infiniop/ops/square/operator.cc diff --git a/include/infiniop.h b/include/infiniop.h index e87839bc2..27f52ac85 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -9,15 +9,22 @@ #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/averagepool.h" +#include "infiniop/ops/batch_norm.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/cross_entropy_loss.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/gather.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/index_copy_inplace.h" +#include "infiniop/ops/interpolate_nearest.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/maxpool.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" @@ -28,6 +35,7 @@ #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/scatter.h" #include "infiniop/ops/sigmoid.h" #include "infiniop/ops/silu.h" #include "infiniop/ops/softmax.h" diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h index 87e857175..752484772 100644 --- a/include/infiniop/ops/averagepool.h +++ b/include/infiniop/ops/averagepool.h @@ -5,7 +5,7 @@ __C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t; -__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, infiniopAvgPoolDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc, @@ -14,16 +14,16 @@ __C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, void *pads, bool ceil_mode); -__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, void *workspace, size_t workspace_size, void *output, const void *input, void *stream); -__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); #endif // __INFINIOP_AVERAGEPOOL_H__ diff --git a/include/infiniop/ops/binary_ops_api.h b/include/infiniop/ops/binary_ops_api.h index 24d7715c9..8ddc6f0fd 100644 --- a/include/infiniop/ops/binary_ops_api.h +++ b/include/infiniop/ops/binary_ops_api.h @@ -15,9 +15,30 @@ // Declare all binary operator APIs BINARY_OP_API_DECLARE(div, Div) +BINARY_OP_API_DECLARE(floor_divide, FloorDivide) BINARY_OP_API_DECLARE(pow, Pow) +BINARY_OP_API_DECLARE(copysign, CopySign) +BINARY_OP_API_DECLARE(hypot, Hypot) 
+BINARY_OP_API_DECLARE(atan2, Atan2) BINARY_OP_API_DECLARE(mod, Mod) +BINARY_OP_API_DECLARE(remainder, Remainder) BINARY_OP_API_DECLARE(max, Max) BINARY_OP_API_DECLARE(min, Min) +BINARY_OP_API_DECLARE(fmax, Fmax) +BINARY_OP_API_DECLARE(fmin, Fmin) +BINARY_OP_API_DECLARE(gt, Gt) +BINARY_OP_API_DECLARE(lt, Lt) +BINARY_OP_API_DECLARE(ge, Ge) +BINARY_OP_API_DECLARE(le, Le) +BINARY_OP_API_DECLARE(eq, Eq) +BINARY_OP_API_DECLARE(ne, Ne) +BINARY_OP_API_DECLARE(logical_and, LogicalAnd) +BINARY_OP_API_DECLARE(logical_or, LogicalOr) +BINARY_OP_API_DECLARE(logical_xor, LogicalXor) +BINARY_OP_API_DECLARE(bitwise_and, BitwiseAnd) +BINARY_OP_API_DECLARE(bitwise_or, BitwiseOr) +BINARY_OP_API_DECLARE(bitwise_xor, BitwiseXor) +BINARY_OP_API_DECLARE(bitwise_left_shift, BitwiseLeftShift) +BINARY_OP_API_DECLARE(bitwise_right_shift, BitwiseRightShift) #endif // __INFINIOP_BINARY_OPS_API_H__ diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h index 8b59843c9..034a0e38f 100644 --- a/include/infiniop/ops/cross_entropy_loss.h +++ b/include/infiniop/ops/cross_entropy_loss.h @@ -5,16 +5,16 @@ typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t; -__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, infiniopCrossEntropyLossDescriptor_t *desc_ptr, infiniopTensorDescriptor_t loss_desc, infiniopTensorDescriptor_t logits_desc, infiniopTensorDescriptor_t target_desc); -__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, +__C __export infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, +__C __export infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, void *workspace, size_t workspace_size, void *loss, @@ -22,6 +22,6 @@ __C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t const void *target, void *stream); -__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); #endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h index 7f970dc38..c604a7d48 100644 --- a/include/infiniop/ops/interpolate_nearest.h +++ b/include/infiniop/ops/interpolate_nearest.h @@ -5,21 +5,21 @@ __C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t; -__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, infiniopInterpolateNearestDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc); -__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, +__C __export infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, +__C __export infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, void *workspace, size_t workspace_size, void *output, 
const void *input, void *stream); -__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); #endif // __INFINIOP_INTERPOLATE_NEAREST_H__ diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h index e47a43aed..7ee387fe8 100644 --- a/include/infiniop/ops/maxpool.h +++ b/include/infiniop/ops/maxpool.h @@ -5,7 +5,7 @@ __C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t; -__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, +__C __export infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, infiniopMaxPoolDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc, @@ -14,16 +14,16 @@ __C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, void *pads, bool ceil_mode); -__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size); -__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, +__C __export infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, void *workspace, size_t workspace_size, void *output, const void *input, void *stream); -__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); +__C __export infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); #endif // __INFINIOP_MAX_POOL_H__ diff --git a/include/infiniop/ops/unary_ops_api.h b/include/infiniop/ops/unary_ops_api.h index 11cae2190..e97c21b8f 100644 --- a/include/infiniop/ops/unary_ops_api.h +++ b/include/infiniop/ops/unary_ops_api.h @@ -17,7 +17,12 @@ // Declare all unary operator APIs UNARY_OP_API_DECLARE(abs, Abs) UNARY_OP_API_DECLARE(log, Log) +UNARY_OP_API_DECLARE(log2, Log2) +UNARY_OP_API_DECLARE(log10, Log10) +UNARY_OP_API_DECLARE(log1p, Log1p) UNARY_OP_API_DECLARE(sqrt, Sqrt) +UNARY_OP_API_DECLARE(square, Square) +UNARY_OP_API_DECLARE(rsqrt, Rsqrt) UNARY_OP_API_DECLARE(reciprocal, Reciprocal) UNARY_OP_API_DECLARE(neg, Neg) UNARY_OP_API_DECLARE(round, Round) @@ -36,6 +41,12 @@ UNARY_OP_API_DECLARE(atan, Atan) UNARY_OP_API_DECLARE(acos, Acos) UNARY_OP_API_DECLARE(ceil, Ceil) UNARY_OP_API_DECLARE(exp, Exp) +UNARY_OP_API_DECLARE(exp2, Exp2) UNARY_OP_API_DECLARE(hardswish, Hardswish) +UNARY_OP_API_DECLARE(isnan, IsNan) +UNARY_OP_API_DECLARE(isinf, IsInf) +UNARY_OP_API_DECLARE(isfinite, IsFinite) +UNARY_OP_API_DECLARE(sinc, Sinc) +UNARY_OP_API_DECLARE(sin, Sin) #endif // __INFINIOP_UNARY_OPS_API_H__ diff --git a/src/infiniop/README.md b/src/infiniop/README.md index b4d4059e1..50e30e9c2 100644 --- a/src/infiniop/README.md +++ b/src/infiniop/README.md @@ -46,3 +46,211 @@ InfiniOP 是 InfiniCore 下属的统一底层算子框架,为相同算子在 一些 CUDA kernel 可以被多个支持 CUDA 的平台公用,可以考虑在头文件中实现,并在多个源文件中使用。 比如 `mul/cuda/kernel.cuh` 中只有 device 测代码,会被多个支持 CUDA 的平台源代码引用。 5. 
算子实现可以成功编译安装后,在 `test/infiniop/` 中添加单测脚本,与 PyTorch 实现进行正确性和性能比较。你可以仿照已有的测试脚本进行开发,以使用各种通用的测试功能。测例应覆盖算子常用类型和形状。测试成功之后可以将测例添加至 `scripts/python_test.py` 一键测试脚本中(这样 Github 自动测试也会包含该算子)。 + +## 添加 Elementwise 算子(Binary/Unary) + +对于逐元素算子(Elementwise Operators),由于重构后的统一框架,添加新算子变得非常简单。以下步骤展示了如何添加一个新的 elementwise 算子。 + +### Binary Elementwise 算子示例(以 `pow` 为例) + +#### 步骤 1: 在 `BinaryMode` 枚举中添加算子 + +在 `src/infiniop/elementwise/binary.h` 的 `BinaryMode` 枚举中添加新算子: + +```cpp +enum class BinaryMode { + // ... 其他算子 + Pow, // 添加新算子 + // ... +}; +``` + +#### 步骤 2: 在 `BinaryOp` 模板中添加计算逻辑 + +在同一文件的 `BinaryOp` 模板中添加对应的计算实现: + +```cpp +template +struct BinaryOp { + template + T operator()(const T &a, const T &b) const { + // ... 其他算子的实现 + else if constexpr (Mode == BinaryMode::Pow) { + return std::pow(a, b); + } + // ... + } +}; +``` + +如果需要在 CUDA 端优化,还需要在 `namespace cuda` 的 `BinaryOp` 模板中添加对应的 CUDA 实现。 + +#### 步骤 3: 在 API 头文件中声明算子 + +在 `include/infiniop/ops/binary_ops_api.h` 中添加: + +```cpp +BINARY_OP_API_DECLARE(pow, Pow) +``` + +#### 步骤 4: 创建算子目录和文件 + +创建目录结构 `src/infiniop/ops/pow/`,并创建以下文件: + +**`operator.cc`** - 主实现文件: +```cpp +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pow_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/pow_nvidia.cuh" +#endif + +BINARY_OP_IMPL(pow, Pow) +``` + +**`cpu/pow_cpu.h`** - CPU 头文件: +```cpp +#ifndef __POW_CPU_H__ +#define __POW_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, op::elementwise::binary::BinaryMode::Pow) + +#endif // __POW_CPU_H__ +``` + +**`cpu/pow_cpu.cc`** - CPU 实现文件: +```cpp +#include "pow_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::pow::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(pow) + +} // namespace op::pow::cpu +``` + +**`nvidia/pow_nvidia.cuh`** - NVIDIA 头文件: +```cpp +#ifndef __POW_CUDA_API_H__ +#define __POW_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(pow, nvidia) + +#endif // __POW_CUDA_API_H__ +``` + +**`nvidia/pow_nvidia.cu`** - NVIDIA 实现文件: +```cpp +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "pow_nvidia.cuh" + +namespace op::pow::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + +} // namespace op::pow::nvidia +``` + +**`cuda/kernel.cuh`**(可选)- 如果需要在 CUDA kernel 中实现特殊逻辑: +```cpp +// 通常不需要,除非有特殊的 CUDA 优化需求 +``` + +### Unary Elementwise 算子示例(以 `abs` 为例) + +Unary 算子的添加流程与 Binary 类似,主要区别如下: + +#### 步骤 1: 在 `UnaryMode` 枚举中添加算子 + +在 `src/infiniop/elementwise/unary.h` 的 `UnaryMode` 枚举中添加: + +```cpp +enum class UnaryMode { + // ... 其他算子 + Abs, // 添加新算子 + // ... +}; +``` + +#### 步骤 2: 在 `UnaryOp` 模板中添加计算逻辑 + +```cpp +template +struct UnaryOp { + template + T operator()(const T &x) const { + // ... 其他算子的实现 + else if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } + // ... 
+ } +}; +``` + +#### 步骤 3: 在 API 头文件中声明算子 + +在 `include/infiniop/ops/unary_ops_api.h` 中添加: + +```cpp +UNARY_OP_API_DECLARE(abs, Abs) +``` + +#### 步骤 4: 创建算子目录和文件 + +文件结构与 Binary 类似,但使用 `UNARY_` 前缀的宏: + +**`operator.cc`**: +```cpp +UNARY_OP_IMPL(abs, Abs) +``` + +**`cpu/abs_cpu.h`**: +```cpp +UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, op::elementwise::unary::UnaryMode::Abs) +``` + +**`cpu/abs_cpu.cc`**: +```cpp +ELEMENTWISE_CPU_IMPL_UNARY(abs) +``` + +**`nvidia/abs_nvidia.cu`**: +```cpp +ELEMENTWISE_NVIDIA_IMPL_UNARY(abs) +``` + +### 总结 + +添加一个新的 elementwise 算子只需要: + +1. ✅ 在对应的 `BinaryMode`/`UnaryMode` 枚举中添加算子 +2. ✅ 在 `BinaryOp`/`UnaryOp` 模板中添加计算逻辑 +3. ✅ 在 API 头文件中使用宏声明算子 +4. ✅ 创建算子目录,使用统一的宏实现各平台代码 + +**关键优势**: +- 代码复用:所有平台共享相同的实现框架 +- 最小改动:只需添加算子特定的计算逻辑 +- 统一接口:自动生成标准的 C API +- 易于维护:修改框架代码即可影响所有算子 + +参考实现: +- Binary: `src/infiniop/ops/pow/` +- Unary: `src/infiniop/ops/abs/` diff --git a/src/infiniop/elementwise/binary.h b/src/infiniop/elementwise/binary.h index 1823fac3f..ba982744d 100644 --- a/src/infiniop/elementwise/binary.h +++ b/src/infiniop/elementwise/binary.h @@ -27,14 +27,40 @@ enum class BinaryMode { Subtract, Multiply, Divide, + FloorDivide, // floor_divide: floor(a / b) Pow, + CopySign, + Hypot, + Atan2, // atan2: atan2(y, x) Mod, + Remainder, Max, Min, - // Logical operations (for future use): - // And, Or, Xor, Less, LessOrEqual, Equal, Greater, GreaterOrEqual + Fmax, + Fmin, + // Comparison operations: + Greater, // gt: a > b + Less, // lt: a < b + GreaterOrEqual, // ge: a >= b + LessOrEqual, // le: a <= b + Equal, // eq: a == b + NotEqual, // ne: a != b + // Logical operations: + LogicalAnd, // logical_and: a && b (non-zero as true) + LogicalOr, // logical_or: a || b (non-zero as true) + LogicalXor, // logical_xor: a ^ b (exactly one non-zero as true) + // Bitwise operations: + BitwiseAnd, // bitwise_and: a & b (only for integral types) + BitwiseOr, // bitwise_or: a | b (only for integral types) + BitwiseXor, // bitwise_xor: a ^ b (only for integral types) + BitwiseLeftShift, // bitwise_left_shift: a << b (only for integral types) + BitwiseRightShift, // bitwise_right_shift: a >> b (only for integral types) }; +// Helper template for static_assert in else branches +template +struct always_false : std::false_type {}; + /** * @brief Generic binary operation template that performs different operations * based on the specified BinaryMode. @@ -59,28 +85,145 @@ struct BinaryOp { return a * b; } else if constexpr (Mode == BinaryMode::Divide) { return a / b; + } else if constexpr (Mode == BinaryMode::FloorDivide) { + // Floor divide: floor(a / b) + if constexpr (std::is_integral_v) { + // For integral types, integer division is already floor division + return a / b; + } else { + // For floating point types, use std::floor + return std::floor(a / b); + } } else if constexpr (Mode == BinaryMode::Pow) { return std::pow(a, b); + } else if constexpr (Mode == BinaryMode::CopySign) { + if constexpr (std::is_floating_point_v) { + return std::copysign(a, b); + } else { + // For integral types, return a with sign of b + return (b < T(0)) ? 
-std::abs(a) : std::abs(a); + } + } else if constexpr (Mode == BinaryMode::Hypot) { + return std::hypot(a, b); + } else if constexpr (Mode == BinaryMode::Atan2) { + // atan2(y, x): returns the angle whose tangent is y/x + return std::atan2(a, b); } else if constexpr (Mode == BinaryMode::Mod) { if constexpr (std::is_floating_point_v) { return std::fmod(a, b); } else { return a % b; } + } else if constexpr (Mode == BinaryMode::Remainder) { + if constexpr (std::is_floating_point_v) { + // PyTorch remainder: x - floor(x/y) * y, result sign matches divisor (y) + T quotient = std::floor(a / b); + return a - quotient * b; + } else { + // For integral types, remainder is same as mod + return a % b; + } } else if constexpr (Mode == BinaryMode::Max) { + // Max: propagates NaN (if either is NaN, result is NaN) if constexpr (std::is_floating_point_v) { - return std::fmax(a, b); + // Use std::max which propagates NaN (a > b ? a : b behavior with NaN) + return (a > b) ? a : b; } else { return std::max(a, b); } } else if constexpr (Mode == BinaryMode::Min) { + // Min: propagates NaN (if either is NaN, result is NaN) + if constexpr (std::is_floating_point_v) { + // Use std::min which propagates NaN (a < b ? a : b behavior with NaN) + return (a < b) ? a : b; + } else { + return std::min(a, b); + } + } else if constexpr (Mode == BinaryMode::Fmax) { + // Fmax: ignores NaN (if one is NaN, return the other) + if constexpr (std::is_floating_point_v) { + return std::fmax(a, b); + } else { + return std::max(a, b); + } + } else if constexpr (Mode == BinaryMode::Fmin) { + // Fmin: ignores NaN (if one is NaN, return the other) if constexpr (std::is_floating_point_v) { return std::fmin(a, b); } else { return std::min(a, b); } + } else if constexpr (Mode == BinaryMode::Greater) { + // Return 1.0 if a > b, else 0.0 + return static_cast(a > b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::Less) { + // Return 1.0 if a < b, else 0.0 + return static_cast(a < b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::GreaterOrEqual) { + // Return 1.0 if a >= b, else 0.0 + return static_cast(a >= b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LessOrEqual) { + // Return 1.0 if a <= b, else 0.0 + return static_cast(a <= b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::Equal) { + // Return 1.0 if a == b, else 0.0 + return static_cast(a == b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::NotEqual) { + // Return 1.0 if a != b, else 0.0 + return static_cast(a != b ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LogicalAnd) { + // Return 1.0 if both a and b are non-zero, else 0.0 + return static_cast((a != T(0) && b != T(0)) ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LogicalOr) { + // Return 1.0 if either a or b is non-zero, else 0.0 + return static_cast((a != T(0) || b != T(0)) ? T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::LogicalXor) { + // Return 1.0 if exactly one of a or b is non-zero, else 0.0 + bool a_nonzero = (a != T(0)); + bool b_nonzero = (b != T(0)); + return static_cast((a_nonzero != b_nonzero) ? 
T(1) : T(0)); + } else if constexpr (Mode == BinaryMode::BitwiseAnd) { + // Bitwise AND: a & b (only for integral types) + if constexpr (std::is_integral_v) { + return a & b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseOr) { + // Bitwise OR: a | b (only for integral types) + if constexpr (std::is_integral_v) { + return a | b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseXor) { + // Bitwise XOR: a ^ b (only for integral types) + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseLeftShift) { + // Bitwise left shift: a << b (only for integral types) + if constexpr (std::is_integral_v) { + return a << b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseRightShift) { + // Bitwise right shift: a >> b (only for integral types) + if constexpr (std::is_integral_v) { + return a >> b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } } else { - static_assert(Mode != Mode, "Unsupported binary operation mode"); + static_assert(always_false::value, "Unsupported binary operation mode"); return a; } } @@ -143,6 +286,24 @@ struct BinaryOp { } else { return a / b; } + } else if constexpr (Mode == BinaryMode::FloorDivide) { + // Floor divide: floor(a / b) + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(floorf(a_f2.x / b_f2.x), floorf(a_f2.y / b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(floorf(a_ / b_)); + } else if constexpr (std::is_integral_v) { + // For integral types, integer division is already floor division + return a / b; + } else if constexpr (std::is_same_v) { + return floorf(a / b); + } else { + return std::floor(a / b); + } } else if constexpr (Mode == BinaryMode::Pow) { if constexpr (std::is_same_v) { float2 a_f2 = __half22float2(a); @@ -151,8 +312,8 @@ struct BinaryOp { } else if constexpr (std::is_same_v) { float a_ = __half2float(a); float b_ = __half2float(b); - float ans_f = __powf(a_, b_); - return __float2half(isnan(ans_f) ? 
std::pow(a_, b_) : ans_f); + // Use __powf only (std::pow is host function, cannot be used in device code) + return __float2half(__powf(a_, b_)); } else if constexpr (std::is_same_v) { float2 a_f2 = __bfloat1622float2(a); float2 b_f2 = __bfloat1622float2(b); @@ -166,42 +327,403 @@ struct BinaryOp { } else { return std::pow(a, b); } + } else if constexpr (Mode == BinaryMode::CopySign) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(copysignf(a_f2.x, b_f2.x), copysignf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(copysignf(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(copysignf(a_f2.x, b_f2.x), copysignf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(copysignf(a_, b_)); + } else if constexpr (std::is_same_v) { + return copysignf(a, b); + } else if constexpr (std::is_floating_point_v) { + return std::copysign(a, b); + } else { + // For integral types, return a with sign of b + return (b < T(0)) ? -std::abs(a) : std::abs(a); + } + } else if constexpr (Mode == BinaryMode::Hypot) { + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(hypotf(a_f2.x, b_f2.x), hypotf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(hypotf(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(hypotf(a_f2.x, b_f2.x), hypotf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(hypotf(a_, b_)); + } else if constexpr (std::is_same_v) { + return hypotf(a, b); + } else { + return std::hypot(a, b); + } + } else if constexpr (Mode == BinaryMode::Atan2) { + // atan2(y, x): returns the angle whose tangent is y/x + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(atan2f(a_f2.x, b_f2.x), atan2f(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(atan2f(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(atan2f(a_f2.x, b_f2.x), atan2f(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(atan2f(a_, b_)); + } else if constexpr (std::is_same_v) { + return atan2f(a, b); + } else { + return std::atan2(a, b); + } } else if constexpr (Mode == BinaryMode::Mod) { if constexpr (std::is_same_v) { float2 a_f2 = __half22float2(a); float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + return __float22half2_rn(make_float2(fmodf(a_f2.x, b_f2.x), fmodf(a_f2.y, b_f2.y))); } else if constexpr (std::is_same_v) { float a_ = __half2float(a); float b_ = __half2float(b); - return 
__float2half(std::fmod(a_, b_)); + return __float2half(fmodf(a_, b_)); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(fmodf(a_f2.x, b_f2.x), fmodf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(fmodf(a_, b_)); } else if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); + return fmodf(a, b); + } else { + return a % b; + } + } else if constexpr (Mode == BinaryMode::Remainder) { + // PyTorch remainder: x - floor(x/y) * y, result sign matches divisor (y) + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + float2 q_f2 = make_float2(floorf(a_f2.x / b_f2.x), floorf(a_f2.y / b_f2.y)); + float2 r_f2 = make_float2(a_f2.x - q_f2.x * b_f2.x, a_f2.y - q_f2.y * b_f2.y); + return __float22half2_rn(r_f2); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float q_ = floorf(a_ / b_); + float r_ = a_ - q_ * b_; + return __float2half(r_); + } else if constexpr (std::is_same_v) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + float2 q_f2 = make_float2(floorf(a_f2.x / b_f2.x), floorf(a_f2.y / b_f2.y)); + float2 r_f2 = make_float2(a_f2.x - q_f2.x * b_f2.x, a_f2.y - q_f2.y * b_f2.y); + return __floats2bfloat162_rn(r_f2.x, r_f2.y); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + float q_ = floorf(a_ / b_); + float r_ = a_ - q_ * b_; + return __float2bfloat16_rn(r_); + } else if constexpr (std::is_same_v) { + float q = floorf(a / b); + return a - q * b; + } else if constexpr (std::is_floating_point_v) { + T quotient = std::floor(a / b); + return a - quotient * b; } else { + // For integral types, remainder is same as mod return a % b; } } else if constexpr (Mode == BinaryMode::Max) { + // Max: propagates NaN (torch.maximum behavior) if constexpr (std::is_same_v) { return __hmax2(a, b); } else if constexpr (std::is_same_v || std::is_same_v) { + // For half/bfloat16, use comparison which propagates NaN return a > b ? a : b; } else if constexpr (std::is_same_v) { - return fmaxf(a, b); + // For float, use comparison which propagates NaN + return a > b ? a : b; } else { return a > b ? a : b; } } else if constexpr (Mode == BinaryMode::Min) { + // Min: propagates NaN (torch.minimum behavior) if constexpr (std::is_same_v) { return __hmin2(a, b); } else if constexpr (std::is_same_v || std::is_same_v) { + // For half/bfloat16, use comparison which propagates NaN + return a < b ? a : b; + } else if constexpr (std::is_same_v) { + // For float, use comparison which propagates NaN return a < b ? a : b; + } else { + return a < b ? 
a : b; + } + } else if constexpr (Mode == BinaryMode::Fmax) { + // Fmax: ignores NaN (torch.fmax behavior - if one is NaN, return the other) + if constexpr (std::is_same_v) { + // __hmax2 may propagate NaN, so implement custom NaN-ignoring version + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(fmaxf(a_f2.x, b_f2.x), fmaxf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(fmaxf(a_, b_)); + } else if constexpr (std::is_same_v) { + float a0 = __bfloat162float(__low2bfloat16(a)); + float a1 = __bfloat162float(__high2bfloat16(a)); + float b0 = __bfloat162float(__low2bfloat16(b)); + float b1 = __bfloat162float(__high2bfloat16(b)); + return __floats2bfloat162_rn(fmaxf(a0, b0), fmaxf(a1, b1)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(fmaxf(a_, b_)); + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else if constexpr (std::is_same_v) { + return fmax(a, b); + } else { + return a > b ? a : b; + } + } else if constexpr (Mode == BinaryMode::Fmin) { + // Fmin: ignores NaN (torch.fmin behavior - if one is NaN, return the other) + if constexpr (std::is_same_v) { + // __hmin2 may propagate NaN, so implement custom NaN-ignoring version + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(fminf(a_f2.x, b_f2.x), fminf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(fminf(a_, b_)); + } else if constexpr (std::is_same_v) { + float a0 = __bfloat162float(__low2bfloat16(a)); + float a1 = __bfloat162float(__high2bfloat16(a)); + float b0 = __bfloat162float(__low2bfloat16(b)); + float b1 = __bfloat162float(__high2bfloat16(b)); + return __floats2bfloat162_rn(fminf(a0, b0), fminf(a1, b1)); + } else if constexpr (std::is_same_v) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(fminf(a_, b_)); } else if constexpr (std::is_same_v) { return fminf(a, b); + } else if constexpr (std::is_same_v) { + return fmin(a, b); } else { return a < b ? a : b; } + } else if constexpr (Mode == BinaryMode::Greater) { + // Return 1.0 if a > b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x > b_f2.x) ? 1.0f : 0.0f, + (a_f2.y > b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a > b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a > b) ? 1.0f : 0.0f; + } else { + return static_cast((a > b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::Less) { + // Return 1.0 if a < b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x < b_f2.x) ? 1.0f : 0.0f, + (a_f2.y < b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a < b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a < b) ? 1.0f : 0.0f; + } else { + return static_cast((a < b) ? 
1 : 0); + } + } else if constexpr (Mode == BinaryMode::GreaterOrEqual) { + // Return 1.0 if a >= b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x >= b_f2.x) ? 1.0f : 0.0f, + (a_f2.y >= b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a >= b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a >= b) ? 1.0f : 0.0f; + } else { + return static_cast((a >= b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::LessOrEqual) { + // Return 1.0 if a <= b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x <= b_f2.x) ? 1.0f : 0.0f, + (a_f2.y <= b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a <= b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a <= b) ? 1.0f : 0.0f; + } else { + return static_cast((a <= b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::Equal) { + // Return 1.0 if a == b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x == b_f2.x) ? 1.0f : 0.0f, + (a_f2.y == b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a == b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a == b) ? 1.0f : 0.0f; + } else { + return static_cast((a == b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::NotEqual) { + // Return 1.0 if a != b, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + (a_f2.x != b_f2.x) ? 1.0f : 0.0f, + (a_f2.y != b_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return (a != b) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return (a != b) ? 1.0f : 0.0f; + } else { + return static_cast((a != b) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::LogicalAnd) { + // Return 1.0 if both a and b are non-zero, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + ((a_f2.x != 0.0f) && (b_f2.x != 0.0f)) ? 1.0f : 0.0f, + ((a_f2.y != 0.0f) && (b_f2.y != 0.0f)) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return ((a != T(0)) && (b != T(0))) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return ((a != 0.0f) && (b != 0.0f)) ? 1.0f : 0.0f; + } else { + return static_cast(((a != T(0)) && (b != T(0))) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::LogicalOr) { + // Return 1.0 if either a or b is non-zero, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2( + ((a_f2.x != 0.0f) || (b_f2.x != 0.0f)) ? 1.0f : 0.0f, + ((a_f2.y != 0.0f) || (b_f2.y != 0.0f)) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + return ((a != T(0)) || (b != T(0))) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + return ((a != 0.0f) || (b != 0.0f)) ? 1.0f : 0.0f; + } else { + return static_cast(((a != T(0)) || (b != T(0))) ? 
1 : 0); + } + } else if constexpr (Mode == BinaryMode::LogicalXor) { + // Return 1.0 if exactly one of a or b is non-zero, else 0.0 + if constexpr (std::is_same_v) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + bool a_x_nonzero = (a_f2.x != 0.0f); + bool b_x_nonzero = (b_f2.x != 0.0f); + bool a_y_nonzero = (a_f2.y != 0.0f); + bool b_y_nonzero = (b_f2.y != 0.0f); + return __float22half2_rn(make_float2( + (a_x_nonzero != b_x_nonzero) ? 1.0f : 0.0f, + (a_y_nonzero != b_y_nonzero) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v || std::is_same_v) { + bool a_nonzero = (a != T(0)); + bool b_nonzero = (b != T(0)); + return (a_nonzero != b_nonzero) ? T(1) : T(0); + } else if constexpr (std::is_same_v) { + bool a_nonzero = (a != 0.0f); + bool b_nonzero = (b != 0.0f); + return (a_nonzero != b_nonzero) ? 1.0f : 0.0f; + } else { + bool a_nonzero = (a != T(0)); + bool b_nonzero = (b != T(0)); + return static_cast((a_nonzero != b_nonzero) ? 1 : 0); + } + } else if constexpr (Mode == BinaryMode::BitwiseAnd) { + // Bitwise AND: a & b (only for integral types) + if constexpr (std::is_integral_v) { + return a & b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseOr) { + // Bitwise OR: a | b (only for integral types) + if constexpr (std::is_integral_v) { + return a | b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseXor) { + // Bitwise XOR: a ^ b (only for integral types) + if constexpr (std::is_integral_v) { + return a ^ b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseLeftShift) { + // Bitwise left shift: a << b (only for integral types) + if constexpr (std::is_integral_v) { + return a << b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } + } else if constexpr (Mode == BinaryMode::BitwiseRightShift) { + // Bitwise right shift: a >> b (only for integral types) + if constexpr (std::is_integral_v) { + return a >> b; + } else { + static_assert(std::is_integral_v, "Bitwise operations require integral types"); + return T(0); + } } else { - static_assert(Mode != Mode, "Unsupported binary operation mode"); + static_assert(always_false::value, "Unsupported binary operation mode"); return a; } } diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h index fff5b1819..fa0e216e0 100644 --- a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h +++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h @@ -32,38 +32,50 @@ /** * @brief Common Calculate Switch Cases (F16 & F32) */ -#define _IMPL_CALC_CASES_COMMON \ - case INFINI_DTYPE_F16: \ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ return _device_info->template calculate(_info, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ + case INFINI_DTYPE_F32: \ return _device_info->template calculate(_info, output, inputs, stream); /** * @brief Extended Calculate Switch Cases (Adds F64 & BF16) */ -#define _IMPL_CALC_CASES_EXTENDED \ - _IMPL_CALC_CASES_COMMON \ - case INFINI_DTYPE_F64: \ +#define _IMPL_CALC_CASES_EXTENDED \ + _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F64: \ return _device_info->template calculate(_info, output, inputs, stream); \ - case 
INFINI_DTYPE_BF16: \ + case INFINI_DTYPE_BF16: \ return _device_info->template calculate(_info, output, inputs, stream); +/** + * @brief Integral Calculate Switch Cases (I32, I64, U8) + * For bitwise operations that only support integral types + */ +#define _IMPL_CALC_CASES_INTEGRAL \ + case INFINI_DTYPE_I32: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->template calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_U8: \ + return _device_info->template calculate(_info, output, inputs, stream); + /** * @brief Generic Template for the Calculate method * @param CASES_MACRO The macro containing the switch cases to use */ -#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - switch (_dtype) { \ - CASES_MACRO \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ } /** @@ -71,20 +83,20 @@ * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking * @param ... Variadic arguments for allowed data types in CHECK_DTYPE */ -#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ - Descriptor::~Descriptor() = default; \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &out_shape = out_desc->shape(); \ - SHAPE_CHECK_BLOCK \ - CHECK_DTYPE(dtype, __VA_ARGS__); \ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) 
\ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ + return INFINI_STATUS_SUCCESS; \ } // ========================================================================= @@ -103,15 +115,14 @@ * ELEMENTWISE_CPU_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_CPU_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -126,13 +137,12 @@ * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -147,13 +157,34 @@ * ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp) * } */ -#define ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ - ) \ +#define ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) +/** + * @brief Implementation for Binary Operators with Integral Types (I32, I64, U8) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators that only support integral types (e.g., bitwise operations). 
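+ * Dtype dispatch is generated from _IMPL_CALC_CASES_INTEGRAL, so only INFINI_DTYPE_I32,
+ * INFINI_DTYPE_I64 and INFINI_DTYPE_U8 are accepted; any other dtype returns
+ * INFINI_STATUS_BAD_TENSOR_DTYPE from the generated calculate() method.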
+ * + * Usage: + * namespace op::bitwise_and::cpu { + * using Op = op::elementwise::binary::BinaryOp; + * ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_and) + * } + */ +#define ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U8) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_INTEGRAL) + #endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh index b0716db1a..4e325e68a 100644 --- a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -32,45 +32,57 @@ /** * @brief Common Calculate Switch Cases (F16 & F32) */ -#define _IMPL_CALC_CASES_COMMON \ - case INFINI_DTYPE_F16: \ +#define _IMPL_CALC_CASES_COMMON \ + case INFINI_DTYPE_F16: \ return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ + case INFINI_DTYPE_F32: \ return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); /** * @brief Extended Calculate Switch Cases (Adds F64 & BF16) * Note: Order is F16, BF16, F32, F64 to match original implementation */ -#define _IMPL_CALC_CASES_EXTENDED \ - case INFINI_DTYPE_F16: \ - return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_BF16: \ +#define _IMPL_CALC_CASES_EXTENDED \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ return _device_info->calculate<256, cuda::Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F32: \ - return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); \ - case INFINI_DTYPE_F64: \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ return _device_info->calculate<256, cuda::Op, double>(_info, workspace, output, inputs, stream); +/** + * @brief Integral Calculate Switch Cases (I32, I64, U8) + * For bitwise operations that only support integral types + */ +#define _IMPL_CALC_CASES_INTEGRAL \ + case INFINI_DTYPE_I32: \ + return _device_info->calculate<256, cuda::Op, int32_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_I64: \ + return _device_info->calculate<256, cuda::Op, int64_t>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_U8: \ + return _device_info->calculate<256, cuda::Op, uint8_t>(_info, workspace, output, inputs, stream); + /** * @brief Generic Template for the Calculate method * @param CASES_MACRO The macro containing the switch cases to use */ -#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ - infiniStatus_t Descriptor::calculate( \ - void *workspace, \ - size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const { \ - if (workspace_size < _workspace_size) { \ +#define _IMPL_CALCULATE_METHOD(CASES_MACRO) \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if 
(workspace_size < _workspace_size) { \ return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ - } \ - switch (_dtype) { \ - CASES_MACRO \ - default: \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ - } \ + } \ + switch (_dtype) { \ + CASES_MACRO \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ } /** @@ -78,20 +90,20 @@ * @param SHAPE_CHECK_BLOCK Code block to execute for shape checking * @param ... Variadic arguments for allowed data types in CHECK_DTYPE */ -#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ - Descriptor::~Descriptor() = default; \ - infiniStatus_t Descriptor::create( \ - infiniopHandle_t handle_, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t out_desc, \ - std::vector input_desc_vec) { \ - auto handle = reinterpret_cast(handle_); \ - auto dtype = out_desc->dtype(); \ - const auto &out_shape = out_desc->shape(); \ - SHAPE_CHECK_BLOCK \ - CHECK_DTYPE(dtype, __VA_ARGS__); \ +#define _IMPL_CREATE_METHOD(SHAPE_CHECK_BLOCK, ...) \ + Descriptor::~Descriptor() = default; \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &out_shape = out_desc->shape(); \ + SHAPE_CHECK_BLOCK \ + CHECK_DTYPE(dtype, __VA_ARGS__); \ CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ - return INFINI_STATUS_SUCCESS; \ + return INFINI_STATUS_SUCCESS; \ } // ========================================================================= @@ -109,15 +121,14 @@ * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &a_desc = input_desc_vec.at(0); \ - const auto &b_desc = input_desc_vec.at(1); \ - const auto &a_shape = a_desc->shape(); \ - const auto &b_shape = b_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -131,13 +142,12 @@ * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32 \ - ) \ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_COMMON) /** @@ -151,13 +161,33 @@ * ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp) * } */ -#define ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(OP) \ - _IMPL_CREATE_METHOD( \ - const auto &x_desc = input_desc_vec.at(0); \ - const auto &x_shape = x_desc->shape(); \ - CHECK_SAME_SHAPE(out_shape, x_shape);, \ - INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16 \ - ) \ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &x_shape = x_desc->shape(); 
\ + CHECK_SAME_SHAPE(out_shape, x_shape);, \ + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16) \ _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_EXTENDED) +/** + * @brief Implementation for Binary Operators with Integral Types (I32, I64, U8) + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators that only support integral types (e.g., bitwise operations). + * + * Usage: + * namespace op::bitwise_and::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_and) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(OP) \ + _IMPL_CREATE_METHOD( \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);, \ + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U8) \ + _IMPL_CALCULATE_METHOD(_IMPL_CALC_CASES_INTEGRAL) + #endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h index 330f305dd..ef3fdc8fd 100644 --- a/src/infiniop/elementwise/unary.h +++ b/src/infiniop/elementwise/unary.h @@ -25,9 +25,15 @@ enum class UnaryMode { // Math operations: Abs, Exp, + Exp2, // exp2: 2^x Log, + Log2, // log2: log base 2 + Log10, // log10: log base 10 + Log1p, // log1p: log(1 + x), numerically stable for values close to zero Reciprocal, Sqrt, + Square, + Rsqrt, Neg, Ceil, Floor, @@ -49,8 +55,16 @@ enum class UnaryMode { Sign, Erf, Hardswish, + IsNan, + IsInf, + IsFinite, + Sinc, }; +// Helper template for static_assert in else branches +template +struct always_false : std::false_type {}; + /** * @brief Generic unary operation template that performs different operations * based on the specified UnaryMode. @@ -75,12 +89,28 @@ struct UnaryOp { } } else if constexpr (Mode == UnaryMode::Exp) { return std::exp(x); + } else if constexpr (Mode == UnaryMode::Exp2) { + // exp2: 2^x + return std::exp2(x); } else if constexpr (Mode == UnaryMode::Log) { return std::log(x); + } else if constexpr (Mode == UnaryMode::Log2) { + // log2: log base 2 + return std::log2(x); + } else if constexpr (Mode == UnaryMode::Log10) { + // log10: log base 10 + return std::log10(x); + } else if constexpr (Mode == UnaryMode::Log1p) { + // log1p: log(1 + x), numerically stable for values close to zero + return std::log1p(x); } else if constexpr (Mode == UnaryMode::Reciprocal) { return T(1) / x; } else if constexpr (Mode == UnaryMode::Sqrt) { return std::sqrt(x); + } else if constexpr (Mode == UnaryMode::Square) { + return x * x; + } else if constexpr (Mode == UnaryMode::Rsqrt) { + return T(1) / std::sqrt(x); } else if constexpr (Mode == UnaryMode::Neg) { return -x; } else if constexpr (Mode == UnaryMode::Ceil) { @@ -125,19 +155,55 @@ struct UnaryOp { return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); } else if constexpr (Mode == UnaryMode::Erf) { return std::erf(x); + } else if constexpr (Mode == UnaryMode::IsNan) { + if constexpr (std::is_floating_point_v) { + return std::isnan(x) ? T(1) : T(0); + } else { + // For integral types, NaN doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsInf) { + if constexpr (std::is_floating_point_v) { + return std::isinf(x) ? 
T(1) : T(0); + } else { + // For integral types, Inf doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsFinite) { + if constexpr (std::is_floating_point_v) { + return std::isfinite(x) ? T(1) : T(0); + } else { + // For integral types, all values are finite, so always return 1 + return T(1); + } + } else if constexpr (Mode == UnaryMode::Sinc) { + // sinc(x) = sin(x) / x, sinc(0) = 1 + // For small values, use Taylor expansion for numerical stability + // sinc(x) ≈ 1 - x²/6 + x⁴/120 - x⁶/5040 + if constexpr (std::is_floating_point_v) { + T abs_x = std::abs(x); + if (abs_x < T(1e-2)) { + T x2 = x * x; + return T(1) - x2 * (T(1) / T(6) - x2 * (T(1) / T(120) - x2 * (T(1) / T(5040)))); + } else { + return std::sin(x) / x; + } + } else { + // For integral types, sinc is not well-defined, return 1 for 0, 0 otherwise + return x == T(0) ? T(1) : T(0); + } } else if constexpr (Mode == UnaryMode::Hardswish) { if constexpr (std::is_integral_v) { return static_cast(0); } else { // x * clamp(x + 3, 0, 6) / 6 - auto x_val = static_cast(x); - double y = x_val + 3.0; - y = std::min(std::max(y, 0.0), 6.0); - double out = x_val * (y / 6.0); - return static_cast(out); + // Use template type T directly instead of double for better performance + T y = x + T(3); + y = std::min(std::max(y, T(0)), T(6)); + return x * (y / T(6)); } } else { - static_assert(Mode != Mode, "Unsupported unary operation mode"); + static_assert(always_false::value, "Unsupported unary operation mode"); return x; } } @@ -186,6 +252,23 @@ struct UnaryOp { } else { return std::exp(x); } + } else if constexpr (Mode == UnaryMode::Exp2) { + // exp2: 2^x + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(exp2f(x_f2.x), exp2f(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(exp2f(__half2float(x))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + return __floats2bfloat162_rn(exp2f(x_f2.x), exp2f(x_f2.y)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(exp2f(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return exp2f(x); + } else { + return std::exp2(x); + } } else if constexpr (Mode == UnaryMode::Log) { if constexpr (std::is_same_v) { return h2log(x); @@ -202,6 +285,62 @@ struct UnaryOp { } else { return std::log(x); } + } else if constexpr (Mode == UnaryMode::Log2) { + // log2: log base 2 + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(log2f(x_f2.x), log2f(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(log2f(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(log2f(x0), log2f(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(log2f(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return log2f(x); + } else { + return std::log2(x); + } + } else if constexpr (Mode == UnaryMode::Log10) { + // log10: log base 10 + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(log10f(x_f2.x), log10f(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(log10f(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return 
__floats2bfloat162_rn(log10f(x0), log10f(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(log10f(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return log10f(x); + } else { + return std::log10(x); + } + } else if constexpr (Mode == UnaryMode::Log1p) { + // log1p: log(1 + x), numerically stable for values close to zero + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(log1pf(x_f2.x), log1pf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(log1pf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(log1pf(x0), log1pf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(log1pf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return log1pf(x); + } else if constexpr (std::is_same_v) { + return log1p(x); + } else { + return std::log1p(x); + } } else if constexpr (Mode == UnaryMode::Reciprocal) { if constexpr (std::is_same_v) { return h2rcp(x); @@ -234,6 +373,25 @@ struct UnaryOp { } else { return std::sqrt(x); } + } else if constexpr (Mode == UnaryMode::Square) { + return x * x; + } else if constexpr (Mode == UnaryMode::Rsqrt) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__frsqrt_rn(x_f2.x), __frsqrt_rn(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__frsqrt_rn(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frsqrt_rn(x0), __frsqrt_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frsqrt_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frsqrt_rn(x); + } else { + return T(1) / std::sqrt(x); + } } else if constexpr (Mode == UnaryMode::Neg) { if constexpr (std::is_same_v) { return __hneg2(x); @@ -409,6 +567,8 @@ struct UnaryOp { return __float2bfloat16_rn(tanhf(__bfloat162float(x))); } else if constexpr (std::is_same_v) { return tanhf(x); + } else if constexpr (std::is_same_v) { + return ::tanh(x); } else { return std::tanh(x); } @@ -474,8 +634,17 @@ struct UnaryOp { } else if constexpr (std::is_same_v) { float x_ = __half2float(x); return __float2half(1.0f / (1.0f + __expf(-x_))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + float2 exp_neg_x = make_float2(__expf(-x_f2.x), __expf(-x_f2.y)); + return __floats2bfloat162_rn(1.0f / (1.0f + exp_neg_x.x), 1.0f / (1.0f + exp_neg_x.y)); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn(1.0f / (1.0f + __expf(-x_))); } else if constexpr (std::is_same_v) { return 1.0f / (1.0f + __expf(-x)); + } else if constexpr (std::is_same_v) { + return 1.0 / (1.0 + exp(-x)); } else { return T(1) / (T(1) + std::exp(-x)); } @@ -499,6 +668,177 @@ struct UnaryOp { } else { return std::erf(x); } + } else if constexpr (Mode == UnaryMode::IsNan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2( + __isnanf(x_f2.x) ? 1.0f : 0.0f, + __isnanf(x_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(__isnanf(x_) ? 
1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn( + __isnanf(x0) ? 1.0f : 0.0f, + __isnanf(x1) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn(__isnanf(x_) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + return __isnanf(x) ? 1.0f : 0.0f; + } else if constexpr (std::is_same_v) { + return __isnan(x) ? 1.0 : 0.0; + } else if constexpr (std::is_floating_point_v) { + return std::isnan(x) ? T(1) : T(0); + } else { + // For integral types, NaN doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsInf) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2( + __isinff(x_f2.x) ? 1.0f : 0.0f, + __isinff(x_f2.y) ? 1.0f : 0.0f)); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(__isinff(x_) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn( + __isinff(x0) ? 1.0f : 0.0f, + __isinff(x1) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn(__isinff(x_) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + return __isinff(x) ? 1.0f : 0.0f; + } else if constexpr (std::is_same_v) { + return __isinf(x) ? 1.0 : 0.0; + } else if constexpr (std::is_floating_point_v) { + return std::isinf(x) ? T(1) : T(0); + } else { + // For integral types, Inf doesn't exist, so always return 0 + return T(0); + } + } else if constexpr (Mode == UnaryMode::IsFinite) { + // isfinite(x) = !isnan(x) && !isinf(x) + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + auto isfinite_f32 = [](float val) -> float { + return (!__isnanf(val) && !__isinff(val)) ? 1.0f : 0.0f; + }; + return __float22half2_rn(make_float2( + isfinite_f32(x_f2.x), + isfinite_f32(x_f2.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half((!__isnanf(x_) && !__isinff(x_)) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + auto isfinite_f32 = [](float val) -> float { + return (!__isnanf(val) && !__isinff(val)) ? 1.0f : 0.0f; + }; + return __floats2bfloat162_rn( + isfinite_f32(x0), + isfinite_f32(x1)); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + return __float2bfloat16_rn((!__isnanf(x_) && !__isinff(x_)) ? 1.0f : 0.0f); + } else if constexpr (std::is_same_v) { + return (!__isnanf(x) && !__isinff(x)) ? 1.0f : 0.0f; + } else if constexpr (std::is_same_v) { + return (!__isnan(x) && !__isinf(x)) ? 1.0 : 0.0; + } else if constexpr (std::is_floating_point_v) { + return std::isfinite(x) ? 
T(1) : T(0); + } else { + // For integral types, all values are finite, so always return 1 + return T(1); + } + } else if constexpr (Mode == UnaryMode::Sinc) { + // sinc(x) = sin(x) / x, sinc(0) = 1 + // For small values, use Taylor expansion for numerical stability + // sinc(x) ≈ 1 - x²/6 + x⁴/120 - x⁶/5040 + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + auto sinc_f32 = [](float val) -> float { + float abs_val = fabsf(val); + if (abs_val < 1e-2f) { + // Use Taylor expansion for small values: 1 - x²/6 + x⁴/120 - x⁶/5040 + float x2 = val * val; + return 1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f))); + } else { + return __sinf(val) / val; + } + }; + return __float22half2_rn(make_float2( + sinc_f32(x_f2.x), + sinc_f32(x_f2.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + float abs_x = fabsf(x_); + if (abs_x < 1e-2f) { + float x2 = x_ * x_; + return __float2half(1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f)))); + } else { + return __float2half(__sinf(x_) / x_); + } + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + auto sinc_f32 = [](float val) -> float { + float abs_val = fabsf(val); + if (abs_val < 1e-2f) { + float x2 = val * val; + return 1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f))); + } else { + return sinf(val) / val; + } + }; + return __floats2bfloat162_rn( + sinc_f32(x0), + sinc_f32(x1)); + } else if constexpr (std::is_same_v) { + float x_ = __bfloat162float(x); + float abs_x = fabsf(x_); + if (abs_x < 1e-2f) { + float x2 = x_ * x_; + return __float2bfloat16_rn(1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f)))); + } else { + return __float2bfloat16_rn(sinf(x_) / x_); + } + } else if constexpr (std::is_same_v) { + float abs_x = fabsf(x); + if (abs_x < 1e-2f) { + float x2 = x * x; + return 1.0f - x2 * (1.0f / 6.0f - x2 * (1.0f / 120.0f - x2 * (1.0f / 5040.0f))); + } else { + return __sinf(x) / x; + } + } else if constexpr (std::is_same_v) { + double abs_x = std::fabs(x); + if (abs_x < 1e-6) { + double x2 = x * x; + return 1.0 - x2 * (1.0 / 6.0 - x2 * (1.0 / 120.0 - x2 * (1.0 / 5040.0))); + } else { + return std::sin(x) / x; + } + } else if constexpr (std::is_floating_point_v) { + T abs_x = std::abs(x); + if (abs_x < T(1e-2)) { + T x2 = x * x; + return T(1) - x2 * (T(1) / T(6) - x2 * (T(1) / T(120) - x2 * (T(1) / T(5040)))); + } else { + return std::sin(x) / x; + } + } else { + // For integral types, sinc is not well-defined, return 1 for 0, 0 otherwise + return x == T(0) ? 
T(1) : T(0); + } } else if constexpr (Mode == UnaryMode::Hardswish) { // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 auto hswish_f32 = [](float x) -> float { @@ -535,7 +875,7 @@ struct UnaryOp { return static_cast(yd); } } else { - static_assert(Mode != Mode, "Unsupported unary operation mode"); + static_assert(always_false::value, "Unsupported unary operation mode"); return x; } } diff --git a/src/infiniop/ops/atan2/cpu/atan2_cpu.cc b/src/infiniop/ops/atan2/cpu/atan2_cpu.cc new file mode 100644 index 000000000..8d300d867 --- /dev/null +++ b/src/infiniop/ops/atan2/cpu/atan2_cpu.cc @@ -0,0 +1,8 @@ +#include "atan2_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::atan2::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(atan2) + +} // namespace op::atan2::cpu diff --git a/src/infiniop/ops/atan2/cpu/atan2_cpu.h b/src/infiniop/ops/atan2/cpu/atan2_cpu.h new file mode 100644 index 000000000..9a26d425d --- /dev/null +++ b/src/infiniop/ops/atan2/cpu/atan2_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ATAN2_CPU_H__ +#define __ATAN2_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(atan2, cpu, op::elementwise::binary::BinaryMode::Atan2) + +#endif // __ATAN2_CPU_H__ diff --git a/src/infiniop/ops/atan2/cuda/kernel.cuh b/src/infiniop/ops/atan2/cuda/kernel.cuh new file mode 100644 index 000000000..22ca06c8d --- /dev/null +++ b/src/infiniop/ops/atan2/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ATAN2_CUDA_H__ +#define __ATAN2_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::atan2::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::atan2::cuda + +#endif // __ATAN2_CUDA_H__ diff --git a/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu new file mode 100644 index 000000000..716ff6884 --- /dev/null +++ b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "atan2_nvidia.cuh" + +namespace op::atan2::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(atan2) + +} // namespace op::atan2::nvidia diff --git a/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh new file mode 100644 index 000000000..c882c22a8 --- /dev/null +++ b/src/infiniop/ops/atan2/nvidia/atan2_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATAN2_CUDA_API_H__ +#define __ATAN2_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atan2, nvidia) + +#endif // __ATAN2_CUDA_API_H__ diff --git a/src/infiniop/ops/atan2/operator.cc b/src/infiniop/ops/atan2/operator.cc new file mode 100644 index 000000000..3580402ac --- /dev/null +++ b/src/infiniop/ops/atan2/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atan2_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atan2_nvidia.cuh" +#endif + +BINARY_OP_IMPL(atan2, Atan2) diff --git a/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc new file mode 100644 index 000000000..cb293baba --- /dev/null +++ b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_and_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_and::cpu { + 
+ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_and) + +} // namespace op::bitwise_and::cpu diff --git a/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h new file mode 100644 index 000000000..348022cb2 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/cpu/bitwise_and_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_AND_CPU_H__ +#define __BITWISE_AND_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_and, cpu, op::elementwise::binary::BinaryMode::BitwiseAnd) + +#endif // __BITWISE_AND_CPU_H__ diff --git a/src/infiniop/ops/bitwise_and/cuda/kernel.cuh b/src/infiniop/ops/bitwise_and/cuda/kernel.cuh new file mode 100644 index 000000000..a1a58b3c9 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_AND_CUDA_H__ +#define __BITWISE_AND_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_and::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_and::cuda + +#endif // __BITWISE_AND_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu new file mode 100644 index 000000000..e97f957cb --- /dev/null +++ b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_and_nvidia.cuh" + +namespace op::bitwise_and::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_and) + +} // namespace op::bitwise_and::nvidia diff --git a/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh new file mode 100644 index 000000000..0f9c65f22 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/nvidia/bitwise_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_AND_CUDA_API_H__ +#define __BITWISE_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_and, nvidia) + +#endif // __BITWISE_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_and/operator.cc b/src/infiniop/ops/bitwise_and/operator.cc new file mode 100644 index 000000000..c934bee86 --- /dev/null +++ b/src/infiniop/ops/bitwise_and/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_and_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_and, BitwiseAnd) diff --git a/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc new file mode 100644 index 000000000..d5f367515 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_left_shift_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_left_shift::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_left_shift) + +} // namespace op::bitwise_left_shift::cpu diff --git a/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h new file mode 100644 index 000000000..36636bb56 --- /dev/null +++ 
b/src/infiniop/ops/bitwise_left_shift/cpu/bitwise_left_shift_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_LEFT_SHIFT_CPU_H__ +#define __BITWISE_LEFT_SHIFT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_left_shift, cpu, op::elementwise::binary::BinaryMode::BitwiseLeftShift) + +#endif // __BITWISE_LEFT_SHIFT_CPU_H__ diff --git a/src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh b/src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh new file mode 100644 index 000000000..0f74548b6 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_LEFT_SHIFT_CUDA_H__ +#define __BITWISE_LEFT_SHIFT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_left_shift::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_left_shift::cuda + +#endif // __BITWISE_LEFT_SHIFT_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu new file mode 100644 index 000000000..66fbd856c --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_left_shift_nvidia.cuh" + +namespace op::bitwise_left_shift::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_left_shift) + +} // namespace op::bitwise_left_shift::nvidia diff --git a/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh new file mode 100644 index 000000000..89a573145 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/nvidia/bitwise_left_shift_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_LEFT_SHIFT_CUDA_API_H__ +#define __BITWISE_LEFT_SHIFT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_left_shift, nvidia) + +#endif // __BITWISE_LEFT_SHIFT_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_left_shift/operator.cc b/src/infiniop/ops/bitwise_left_shift/operator.cc new file mode 100644 index 000000000..297874f10 --- /dev/null +++ b/src/infiniop/ops/bitwise_left_shift/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_left_shift_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_left_shift_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_left_shift, BitwiseLeftShift) diff --git a/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc new file mode 100644 index 000000000..9808c8294 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_or_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_or::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_or) + +} // namespace op::bitwise_or::cpu diff --git a/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h new file mode 100644 index 000000000..f0503b1fd --- /dev/null +++ b/src/infiniop/ops/bitwise_or/cpu/bitwise_or_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_OR_CPU_H__ +#define __BITWISE_OR_CPU_H__ + +#include 
"../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_or, cpu, op::elementwise::binary::BinaryMode::BitwiseOr) + +#endif // __BITWISE_OR_CPU_H__ diff --git a/src/infiniop/ops/bitwise_or/cuda/kernel.cuh b/src/infiniop/ops/bitwise_or/cuda/kernel.cuh new file mode 100644 index 000000000..0424f03b8 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_OR_CUDA_H__ +#define __BITWISE_OR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_or::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_or::cuda + +#endif // __BITWISE_OR_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu new file mode 100644 index 000000000..8bf58dbfa --- /dev/null +++ b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_or_nvidia.cuh" + +namespace op::bitwise_or::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_or) + +} // namespace op::bitwise_or::nvidia diff --git a/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh new file mode 100644 index 000000000..419ac2603 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/nvidia/bitwise_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_OR_CUDA_API_H__ +#define __BITWISE_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_or, nvidia) + +#endif // __BITWISE_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_or/operator.cc b/src/infiniop/ops/bitwise_or/operator.cc new file mode 100644 index 000000000..40a68f2a8 --- /dev/null +++ b/src/infiniop/ops/bitwise_or/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_or_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_or, BitwiseOr) diff --git a/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc new file mode 100644 index 000000000..c11022f26 --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_right_shift_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_right_shift::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_right_shift) + +} // namespace op::bitwise_right_shift::cpu diff --git a/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h new file mode 100644 index 000000000..f4d023c4d --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/cpu/bitwise_right_shift_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_RIGHT_SHIFT_CPU_H__ +#define __BITWISE_RIGHT_SHIFT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_right_shift, cpu, op::elementwise::binary::BinaryMode::BitwiseRightShift) + +#endif // __BITWISE_RIGHT_SHIFT_CPU_H__ diff --git a/src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh 
b/src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh new file mode 100644 index 000000000..43308fb3b --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_RIGHT_SHIFT_CUDA_H__ +#define __BITWISE_RIGHT_SHIFT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_right_shift::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_right_shift::cuda + +#endif // __BITWISE_RIGHT_SHIFT_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu new file mode 100644 index 000000000..2be31f1ce --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_right_shift_nvidia.cuh" + +namespace op::bitwise_right_shift::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_right_shift) + +} // namespace op::bitwise_right_shift::nvidia diff --git a/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh new file mode 100644 index 000000000..749189263 --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/nvidia/bitwise_right_shift_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_RIGHT_SHIFT_CUDA_API_H__ +#define __BITWISE_RIGHT_SHIFT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_right_shift, nvidia) + +#endif // __BITWISE_RIGHT_SHIFT_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_right_shift/operator.cc b/src/infiniop/ops/bitwise_right_shift/operator.cc new file mode 100644 index 000000000..6c5c1957e --- /dev/null +++ b/src/infiniop/ops/bitwise_right_shift/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_right_shift_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_right_shift_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_right_shift, BitwiseRightShift) diff --git a/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc new file mode 100644 index 000000000..6f8cdc7e3 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.cc @@ -0,0 +1,8 @@ +#include "bitwise_xor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::bitwise_xor::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY_INTEGRAL(bitwise_xor) + +} // namespace op::bitwise_xor::cpu diff --git a/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h new file mode 100644 index 000000000..e971b7a16 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/cpu/bitwise_xor_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BITWISE_XOR_CPU_H__ +#define __BITWISE_XOR_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(bitwise_xor, cpu, op::elementwise::binary::BinaryMode::BitwiseXor) + +#endif // __BITWISE_XOR_CPU_H__ diff --git a/src/infiniop/ops/bitwise_xor/cuda/kernel.cuh b/src/infiniop/ops/bitwise_xor/cuda/kernel.cuh new file mode 100644 index 000000000..ef7f23bca --- /dev/null +++ 
b/src/infiniop/ops/bitwise_xor/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __BITWISE_XOR_CUDA_H__ +#define __BITWISE_XOR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::bitwise_xor::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::bitwise_xor::cuda + +#endif // __BITWISE_XOR_CUDA_H__ diff --git a/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu new file mode 100644 index 000000000..64aff8297 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "bitwise_xor_nvidia.cuh" + +namespace op::bitwise_xor::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY_INTEGRAL(bitwise_xor) + +} // namespace op::bitwise_xor::nvidia diff --git a/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh new file mode 100644 index 000000000..e3ae06a54 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/nvidia/bitwise_xor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BITWISE_XOR_CUDA_API_H__ +#define __BITWISE_XOR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(bitwise_xor, nvidia) + +#endif // __BITWISE_XOR_CUDA_API_H__ diff --git a/src/infiniop/ops/bitwise_xor/operator.cc b/src/infiniop/ops/bitwise_xor/operator.cc new file mode 100644 index 000000000..04529fe68 --- /dev/null +++ b/src/infiniop/ops/bitwise_xor/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/bitwise_xor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/bitwise_xor_nvidia.cuh" +#endif + +BINARY_OP_IMPL(bitwise_xor, BitwiseXor) diff --git a/src/infiniop/ops/copysign/cpu/copysign_cpu.cc b/src/infiniop/ops/copysign/cpu/copysign_cpu.cc new file mode 100644 index 000000000..fe3757fa5 --- /dev/null +++ b/src/infiniop/ops/copysign/cpu/copysign_cpu.cc @@ -0,0 +1,8 @@ +#include "copysign_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::copysign::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(copysign) + +} // namespace op::copysign::cpu diff --git a/src/infiniop/ops/copysign/cpu/copysign_cpu.h b/src/infiniop/ops/copysign/cpu/copysign_cpu.h new file mode 100644 index 000000000..99123765e --- /dev/null +++ b/src/infiniop/ops/copysign/cpu/copysign_cpu.h @@ -0,0 +1,9 @@ +#ifndef __COPYSIGN_CPU_H__ +#define __COPYSIGN_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(copysign, cpu, op::elementwise::binary::BinaryMode::CopySign) + +#endif // __COPYSIGN_CPU_H__ diff --git a/src/infiniop/ops/copysign/cuda/kernel.cuh b/src/infiniop/ops/copysign/cuda/kernel.cuh new file mode 100644 index 000000000..2064923c9 --- /dev/null +++ b/src/infiniop/ops/copysign/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __COPYSIGN_CUDA_H__ +#define __COPYSIGN_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::copysign::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::copysign::cuda + +#endif // __COPYSIGN_CUDA_H__ diff --git a/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu new file mode 100644 index 000000000..261a4a204 --- /dev/null +++ 
b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "copysign_nvidia.cuh" + +namespace op::copysign::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(copysign) + +} // namespace op::copysign::nvidia diff --git a/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh new file mode 100644 index 000000000..33005c379 --- /dev/null +++ b/src/infiniop/ops/copysign/nvidia/copysign_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COPYSIGN_NVIDIA_API_H__ +#define __COPYSIGN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(copysign, nvidia) + +#endif // __COPYSIGN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/copysign/operator.cc b/src/infiniop/ops/copysign/operator.cc new file mode 100644 index 000000000..0c8652ff7 --- /dev/null +++ b/src/infiniop/ops/copysign/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/copysign_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/copysign_nvidia.cuh" +#endif + +BINARY_OP_IMPL(copysign, CopySign) diff --git a/src/infiniop/ops/eq/cpu/eq_cpu.cc b/src/infiniop/ops/eq/cpu/eq_cpu.cc new file mode 100644 index 000000000..e9b5dcefa --- /dev/null +++ b/src/infiniop/ops/eq/cpu/eq_cpu.cc @@ -0,0 +1,8 @@ +#include "eq_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::eq::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(eq) + +} // namespace op::eq::cpu diff --git a/src/infiniop/ops/eq/cpu/eq_cpu.h b/src/infiniop/ops/eq/cpu/eq_cpu.h new file mode 100644 index 000000000..c1de9b01c --- /dev/null +++ b/src/infiniop/ops/eq/cpu/eq_cpu.h @@ -0,0 +1,9 @@ +#ifndef __EQ_CPU_H__ +#define __EQ_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(eq, cpu, op::elementwise::binary::BinaryMode::Equal) + +#endif // __EQ_CPU_H__ diff --git a/src/infiniop/ops/eq/cuda/kernel.cuh b/src/infiniop/ops/eq/cuda/kernel.cuh new file mode 100644 index 000000000..a8557604c --- /dev/null +++ b/src/infiniop/ops/eq/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __EQ_CUDA_H__ +#define __EQ_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::eq::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::eq::cuda + +#endif // __EQ_CUDA_H__ diff --git a/src/infiniop/ops/eq/nvidia/eq_nvidia.cu b/src/infiniop/ops/eq/nvidia/eq_nvidia.cu new file mode 100644 index 000000000..b3dd9722d --- /dev/null +++ b/src/infiniop/ops/eq/nvidia/eq_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "eq_nvidia.cuh" + +namespace op::eq::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(eq) + +} // namespace op::eq::nvidia diff --git a/src/infiniop/ops/eq/nvidia/eq_nvidia.cuh b/src/infiniop/ops/eq/nvidia/eq_nvidia.cuh new file mode 100644 index 000000000..bbe6ccbdb --- /dev/null +++ b/src/infiniop/ops/eq/nvidia/eq_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EQ_CUDA_API_H__ +#define __EQ_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(eq, nvidia) + +#endif // __EQ_CUDA_API_H__ diff --git a/src/infiniop/ops/eq/operator.cc b/src/infiniop/ops/eq/operator.cc new file mode 
100644 index 000000000..380fcfab7 --- /dev/null +++ b/src/infiniop/ops/eq/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/eq_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/eq_nvidia.cuh" +#endif + +BINARY_OP_IMPL(eq, Eq) diff --git a/src/infiniop/ops/exp2/cpu/exp2_cpu.cc b/src/infiniop/ops/exp2/cpu/exp2_cpu.cc new file mode 100644 index 000000000..4a1bfdc88 --- /dev/null +++ b/src/infiniop/ops/exp2/cpu/exp2_cpu.cc @@ -0,0 +1,8 @@ +#include "exp2_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::exp2::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(exp2) + +} // namespace op::exp2::cpu diff --git a/src/infiniop/ops/exp2/cpu/exp2_cpu.h b/src/infiniop/ops/exp2/cpu/exp2_cpu.h new file mode 100644 index 000000000..1f86686a4 --- /dev/null +++ b/src/infiniop/ops/exp2/cpu/exp2_cpu.h @@ -0,0 +1,9 @@ +#ifndef __EXP2_CPU_H__ +#define __EXP2_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(exp2, cpu, op::elementwise::unary::UnaryMode::Exp2) + +#endif // __EXP2_CPU_H__ diff --git a/src/infiniop/ops/exp2/cuda/kernel.cuh b/src/infiniop/ops/exp2/cuda/kernel.cuh new file mode 100644 index 000000000..50987350c --- /dev/null +++ b/src/infiniop/ops/exp2/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __EXP2_CUDA_H__ +#define __EXP2_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::exp2::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::exp2::cuda + +#endif // __EXP2_CUDA_H__ diff --git a/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu new file mode 100644 index 000000000..6097a7745 --- /dev/null +++ b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "exp2_nvidia.cuh" + +namespace op::exp2::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(exp2) + +} // namespace op::exp2::nvidia diff --git a/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh new file mode 100644 index 000000000..d00e368fb --- /dev/null +++ b/src/infiniop/ops/exp2/nvidia/exp2_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP2_CUDA_API_H__ +#define __EXP2_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp2, nvidia) + +#endif // __EXP2_CUDA_API_H__ diff --git a/src/infiniop/ops/exp2/operator.cc b/src/infiniop/ops/exp2/operator.cc new file mode 100644 index 000000000..c4f695f18 --- /dev/null +++ b/src/infiniop/ops/exp2/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp2_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/exp2_nvidia.cuh" +#endif + +UNARY_OP_IMPL(exp2, Exp2) diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc new file mode 100644 index 000000000..5e6665a70 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.cc @@ -0,0 +1,8 @@ +#include "floor_divide_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::floor_divide::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(floor_divide) 
+ +} // namespace op::floor_divide::cpu diff --git a/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h new file mode 100644 index 000000000..e3286b837 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cpu/floor_divide_cpu.h @@ -0,0 +1,9 @@ +#ifndef __FLOOR_DIVIDE_CPU_H__ +#define __FLOOR_DIVIDE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(floor_divide, cpu, op::elementwise::binary::BinaryMode::FloorDivide) + +#endif // __FLOOR_DIVIDE_CPU_H__ diff --git a/src/infiniop/ops/floor_divide/cuda/kernel.cuh b/src/infiniop/ops/floor_divide/cuda/kernel.cuh new file mode 100644 index 000000000..a4303f883 --- /dev/null +++ b/src/infiniop/ops/floor_divide/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __FLOOR_DIVIDE_CUDA_H__ +#define __FLOOR_DIVIDE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::floor_divide::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::floor_divide::cuda + +#endif // __FLOOR_DIVIDE_CUDA_H__ diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu new file mode 100644 index 000000000..764891fa4 --- /dev/null +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "floor_divide_nvidia.cuh" + +namespace op::floor_divide::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(floor_divide) + +} // namespace op::floor_divide::nvidia diff --git a/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh new file mode 100644 index 000000000..1c70343cf --- /dev/null +++ b/src/infiniop/ops/floor_divide/nvidia/floor_divide_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FLOOR_DIVIDE_CUDA_API_H__ +#define __FLOOR_DIVIDE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(floor_divide, nvidia) + +#endif // __FLOOR_DIVIDE_CUDA_API_H__ diff --git a/src/infiniop/ops/floor_divide/operator.cc b/src/infiniop/ops/floor_divide/operator.cc new file mode 100644 index 000000000..3de7b971d --- /dev/null +++ b/src/infiniop/ops/floor_divide/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/floor_divide_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/floor_divide_nvidia.cuh" +#endif + +BINARY_OP_IMPL(floor_divide, FloorDivide) diff --git a/src/infiniop/ops/fmax/cpu/fmax_cpu.cc b/src/infiniop/ops/fmax/cpu/fmax_cpu.cc new file mode 100644 index 000000000..72cda24f9 --- /dev/null +++ b/src/infiniop/ops/fmax/cpu/fmax_cpu.cc @@ -0,0 +1,8 @@ +#include "fmax_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::fmax::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(fmax) + +} // namespace op::fmax::cpu diff --git a/src/infiniop/ops/fmax/cpu/fmax_cpu.h b/src/infiniop/ops/fmax/cpu/fmax_cpu.h new file mode 100644 index 000000000..f9abc2e87 --- /dev/null +++ b/src/infiniop/ops/fmax/cpu/fmax_cpu.h @@ -0,0 +1,9 @@ +#ifndef __FMAX_CPU_H__ +#define __FMAX_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(fmax, cpu, 
op::elementwise::binary::BinaryMode::Fmax) + +#endif // __FMAX_CPU_H__ diff --git a/src/infiniop/ops/fmax/cuda/kernel.cuh b/src/infiniop/ops/fmax/cuda/kernel.cuh new file mode 100644 index 000000000..ad1c61116 --- /dev/null +++ b/src/infiniop/ops/fmax/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __FMAX_CUDA_H__ +#define __FMAX_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::fmax::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::fmax::cuda + +#endif // __FMAX_CUDA_H__ diff --git a/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu new file mode 100644 index 000000000..3a259f435 --- /dev/null +++ b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "fmax_nvidia.cuh" + +namespace op::fmax::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(fmax) + +} // namespace op::fmax::nvidia diff --git a/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh new file mode 100644 index 000000000..8e2b1d74a --- /dev/null +++ b/src/infiniop/ops/fmax/nvidia/fmax_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FMAX_NVIDIA_API_H__ +#define __FMAX_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmax, nvidia) + +#endif // __FMAX_NVIDIA_API_H__ diff --git a/src/infiniop/ops/fmax/operator.cc b/src/infiniop/ops/fmax/operator.cc new file mode 100644 index 000000000..32ce7235c --- /dev/null +++ b/src/infiniop/ops/fmax/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmax_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/fmax_nvidia.cuh" +#endif + +BINARY_OP_IMPL(fmax, Fmax) diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.cc b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc new file mode 100644 index 000000000..730371b63 --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc @@ -0,0 +1,8 @@ +#include "fmin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::fmin::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(fmin) + +} // namespace op::fmin::cpu diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.h b/src/infiniop/ops/fmin/cpu/fmin_cpu.h new file mode 100644 index 000000000..9ea1781e4 --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.h @@ -0,0 +1,9 @@ +#ifndef __FMIN_CPU_H__ +#define __FMIN_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(fmin, cpu, op::elementwise::binary::BinaryMode::Fmin) + +#endif // __FMIN_CPU_H__ diff --git a/src/infiniop/ops/fmin/cuda/kernel.cuh b/src/infiniop/ops/fmin/cuda/kernel.cuh new file mode 100644 index 000000000..57e641c8b --- /dev/null +++ b/src/infiniop/ops/fmin/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __FMIN_CUDA_H__ +#define __FMIN_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::fmin::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::fmin::cuda + +#endif // __FMIN_CUDA_H__ diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu new file mode 100644 index 000000000..cda271a51 --- /dev/null +++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + 
+#include "../cuda/kernel.cuh" +#include "fmin_nvidia.cuh" + +namespace op::fmin::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(fmin) + +} // namespace op::fmin::nvidia diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh new file mode 100644 index 000000000..10a391e40 --- /dev/null +++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FMIN_NVIDIA_API_H__ +#define __FMIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmin, nvidia) + +#endif // __FMIN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/fmin/operator.cc b/src/infiniop/ops/fmin/operator.cc new file mode 100644 index 000000000..f0e85cfe9 --- /dev/null +++ b/src/infiniop/ops/fmin/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/fmin_nvidia.cuh" +#endif + +BINARY_OP_IMPL(fmin, Fmin) diff --git a/src/infiniop/ops/ge/cpu/ge_cpu.cc b/src/infiniop/ops/ge/cpu/ge_cpu.cc new file mode 100644 index 000000000..56b3bc383 --- /dev/null +++ b/src/infiniop/ops/ge/cpu/ge_cpu.cc @@ -0,0 +1,8 @@ +#include "ge_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::ge::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(ge) + +} // namespace op::ge::cpu diff --git a/src/infiniop/ops/ge/cpu/ge_cpu.h b/src/infiniop/ops/ge/cpu/ge_cpu.h new file mode 100644 index 000000000..461c36411 --- /dev/null +++ b/src/infiniop/ops/ge/cpu/ge_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GE_CPU_H__ +#define __GE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(ge, cpu, op::elementwise::binary::BinaryMode::GreaterOrEqual) + +#endif // __GE_CPU_H__ diff --git a/src/infiniop/ops/ge/cuda/kernel.cuh b/src/infiniop/ops/ge/cuda/kernel.cuh new file mode 100644 index 000000000..40dfdd8b2 --- /dev/null +++ b/src/infiniop/ops/ge/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __GE_CUDA_H__ +#define __GE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::ge::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::ge::cuda + +#endif // __GE_CUDA_H__ diff --git a/src/infiniop/ops/ge/nvidia/ge_nvidia.cu b/src/infiniop/ops/ge/nvidia/ge_nvidia.cu new file mode 100644 index 000000000..c29f985aa --- /dev/null +++ b/src/infiniop/ops/ge/nvidia/ge_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "ge_nvidia.cuh" + +namespace op::ge::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(ge) + +} // namespace op::ge::nvidia diff --git a/src/infiniop/ops/ge/nvidia/ge_nvidia.cuh b/src/infiniop/ops/ge/nvidia/ge_nvidia.cuh new file mode 100644 index 000000000..a20742080 --- /dev/null +++ b/src/infiniop/ops/ge/nvidia/ge_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GE_CUDA_API_H__ +#define __GE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ge, nvidia) + +#endif // __GE_CUDA_API_H__ diff --git a/src/infiniop/ops/ge/operator.cc b/src/infiniop/ops/ge/operator.cc new file mode 100644 index 000000000..eda9851d2 --- /dev/null +++ b/src/infiniop/ops/ge/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include 
"cpu/ge_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ge_nvidia.cuh" +#endif + +BINARY_OP_IMPL(ge, Ge) diff --git a/src/infiniop/ops/gt/cpu/gt_cpu.cc b/src/infiniop/ops/gt/cpu/gt_cpu.cc new file mode 100644 index 000000000..bdd204dde --- /dev/null +++ b/src/infiniop/ops/gt/cpu/gt_cpu.cc @@ -0,0 +1,8 @@ +#include "gt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::gt::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(gt) + +} // namespace op::gt::cpu diff --git a/src/infiniop/ops/gt/cpu/gt_cpu.h b/src/infiniop/ops/gt/cpu/gt_cpu.h new file mode 100644 index 000000000..a637a856d --- /dev/null +++ b/src/infiniop/ops/gt/cpu/gt_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GT_CPU_H__ +#define __GT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(gt, cpu, op::elementwise::binary::BinaryMode::Greater) + +#endif // __GT_CPU_H__ diff --git a/src/infiniop/ops/gt/cuda/kernel.cuh b/src/infiniop/ops/gt/cuda/kernel.cuh new file mode 100644 index 000000000..1d4bfa720 --- /dev/null +++ b/src/infiniop/ops/gt/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __GT_CUDA_H__ +#define __GT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::gt::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::gt::cuda + +#endif // __GT_CUDA_H__ diff --git a/src/infiniop/ops/gt/nvidia/gt_nvidia.cu b/src/infiniop/ops/gt/nvidia/gt_nvidia.cu new file mode 100644 index 000000000..a0eea040e --- /dev/null +++ b/src/infiniop/ops/gt/nvidia/gt_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "gt_nvidia.cuh" + +namespace op::gt::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(gt) + +} // namespace op::gt::nvidia diff --git a/src/infiniop/ops/gt/nvidia/gt_nvidia.cuh b/src/infiniop/ops/gt/nvidia/gt_nvidia.cuh new file mode 100644 index 000000000..ce5517d89 --- /dev/null +++ b/src/infiniop/ops/gt/nvidia/gt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GT_CUDA_API_H__ +#define __GT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gt, nvidia) + +#endif // __GT_CUDA_API_H__ diff --git a/src/infiniop/ops/gt/operator.cc b/src/infiniop/ops/gt/operator.cc new file mode 100644 index 000000000..74a5cec53 --- /dev/null +++ b/src/infiniop/ops/gt/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/gt_nvidia.cuh" +#endif + +BINARY_OP_IMPL(gt, Gt) diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.cc b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc new file mode 100644 index 000000000..20f90fe40 --- /dev/null +++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.cc @@ -0,0 +1,8 @@ +#include "hypot_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::hypot::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(hypot) + +} // namespace op::hypot::cpu diff --git a/src/infiniop/ops/hypot/cpu/hypot_cpu.h b/src/infiniop/ops/hypot/cpu/hypot_cpu.h new file mode 100644 index 000000000..d9b1db8cd --- /dev/null +++ b/src/infiniop/ops/hypot/cpu/hypot_cpu.h @@ -0,0 +1,9 @@ +#ifndef __HYPOT_CPU_H__ +#define __HYPOT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include 
"../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(hypot, cpu, op::elementwise::binary::BinaryMode::Hypot) + +#endif // __HYPOT_CPU_H__ diff --git a/src/infiniop/ops/hypot/cuda/kernel.cuh b/src/infiniop/ops/hypot/cuda/kernel.cuh new file mode 100644 index 000000000..9616d5bbe --- /dev/null +++ b/src/infiniop/ops/hypot/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __HYPOT_CUDA_H__ +#define __HYPOT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::hypot::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::hypot::cuda + +#endif // __HYPOT_CUDA_H__ diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu new file mode 100644 index 000000000..5c3aaaa73 --- /dev/null +++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "hypot_nvidia.cuh" + +namespace op::hypot::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(hypot) + +} // namespace op::hypot::nvidia diff --git a/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh new file mode 100644 index 000000000..879bc02dc --- /dev/null +++ b/src/infiniop/ops/hypot/nvidia/hypot_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HYPOT_NVIDIA_API_H__ +#define __HYPOT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hypot, nvidia) + +#endif // __HYPOT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/hypot/operator.cc b/src/infiniop/ops/hypot/operator.cc new file mode 100644 index 000000000..a2ebf9bea --- /dev/null +++ b/src/infiniop/ops/hypot/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hypot_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/hypot_nvidia.cuh" +#endif + +BINARY_OP_IMPL(hypot, Hypot) diff --git a/src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc new file mode 100644 index 000000000..f7bcb3d8a --- /dev/null +++ b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.cc @@ -0,0 +1,8 @@ +#include "isfinite_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::isfinite::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(isfinite) + +} // namespace op::isfinite::cpu diff --git a/src/infiniop/ops/isfinite/cpu/isfinite_cpu.h b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.h new file mode 100644 index 000000000..9a0fa1c3f --- /dev/null +++ b/src/infiniop/ops/isfinite/cpu/isfinite_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ISFINITE_CPU_H__ +#define __ISFINITE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(isfinite, cpu, op::elementwise::unary::UnaryMode::IsFinite) + +#endif // __ISFINITE_CPU_H__ diff --git a/src/infiniop/ops/isfinite/cuda/kernel.cuh b/src/infiniop/ops/isfinite/cuda/kernel.cuh new file mode 100644 index 000000000..edbd0a548 --- /dev/null +++ b/src/infiniop/ops/isfinite/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ISFINITE_CUDA_H__ +#define __ISFINITE_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::isfinite::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::isfinite::cuda + +#endif // __ISFINITE_CUDA_H__ diff --git a/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu 
b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu new file mode 100644 index 000000000..a76ea559d --- /dev/null +++ b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "isfinite_nvidia.cuh" + +namespace op::isfinite::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(isfinite) + +} // namespace op::isfinite::nvidia diff --git a/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh new file mode 100644 index 000000000..49e0f3e99 --- /dev/null +++ b/src/infiniop/ops/isfinite/nvidia/isfinite_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ISFINITE_NVIDIA_API_H__ +#define __ISFINITE_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(isfinite, nvidia) + +#endif // __ISFINITE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/isfinite/operator.cc b/src/infiniop/ops/isfinite/operator.cc new file mode 100644 index 000000000..92c88523e --- /dev/null +++ b/src/infiniop/ops/isfinite/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/isfinite_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/isfinite_nvidia.cuh" +#endif + +UNARY_OP_IMPL(isfinite, IsFinite) diff --git a/src/infiniop/ops/isinf/cpu/isinf_cpu.cc b/src/infiniop/ops/isinf/cpu/isinf_cpu.cc new file mode 100644 index 000000000..8327746fc --- /dev/null +++ b/src/infiniop/ops/isinf/cpu/isinf_cpu.cc @@ -0,0 +1,8 @@ +#include "isinf_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::isinf::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(isinf) + +} // namespace op::isinf::cpu diff --git a/src/infiniop/ops/isinf/cpu/isinf_cpu.h b/src/infiniop/ops/isinf/cpu/isinf_cpu.h new file mode 100644 index 000000000..edc4a1fcc --- /dev/null +++ b/src/infiniop/ops/isinf/cpu/isinf_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ISINF_CPU_H__ +#define __ISINF_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(isinf, cpu, op::elementwise::unary::UnaryMode::IsInf) + +#endif // __ISINF_CPU_H__ diff --git a/src/infiniop/ops/isinf/cuda/kernel.cuh b/src/infiniop/ops/isinf/cuda/kernel.cuh new file mode 100644 index 000000000..272ff3f08 --- /dev/null +++ b/src/infiniop/ops/isinf/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ISINF_CUDA_H__ +#define __ISINF_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::isinf::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::isinf::cuda + +#endif // __ISINF_CUDA_H__ diff --git a/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu new file mode 100644 index 000000000..1fd88363f --- /dev/null +++ b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "isinf_nvidia.cuh" + +namespace op::isinf::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(isinf) + +} // namespace op::isinf::nvidia diff --git a/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh new file mode 100644 index 000000000..072a749ab --- /dev/null +++ b/src/infiniop/ops/isinf/nvidia/isinf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ISINF_NVIDIA_API_H__ +#define 
__ISINF_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(isinf, nvidia) + +#endif // __ISINF_NVIDIA_API_H__ diff --git a/src/infiniop/ops/isinf/operator.cc b/src/infiniop/ops/isinf/operator.cc new file mode 100644 index 000000000..de042520d --- /dev/null +++ b/src/infiniop/ops/isinf/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/isinf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/isinf_nvidia.cuh" +#endif + +UNARY_OP_IMPL(isinf, IsInf) diff --git a/src/infiniop/ops/isnan/cpu/isnan_cpu.cc b/src/infiniop/ops/isnan/cpu/isnan_cpu.cc new file mode 100644 index 000000000..398bc3039 --- /dev/null +++ b/src/infiniop/ops/isnan/cpu/isnan_cpu.cc @@ -0,0 +1,8 @@ +#include "isnan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::isnan::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(isnan) + +} // namespace op::isnan::cpu diff --git a/src/infiniop/ops/isnan/cpu/isnan_cpu.h b/src/infiniop/ops/isnan/cpu/isnan_cpu.h new file mode 100644 index 000000000..002ad36d4 --- /dev/null +++ b/src/infiniop/ops/isnan/cpu/isnan_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ISNAN_CPU_H__ +#define __ISNAN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(isnan, cpu, op::elementwise::unary::UnaryMode::IsNan) + +#endif // __ISNAN_CPU_H__ diff --git a/src/infiniop/ops/isnan/cuda/kernel.cuh b/src/infiniop/ops/isnan/cuda/kernel.cuh new file mode 100644 index 000000000..890688422 --- /dev/null +++ b/src/infiniop/ops/isnan/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __ISNAN_CUDA_H__ +#define __ISNAN_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::isnan::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::isnan::cuda + +#endif // __ISNAN_CUDA_H__ diff --git a/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu new file mode 100644 index 000000000..8f1cd695e --- /dev/null +++ b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "isnan_nvidia.cuh" + +namespace op::isnan::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(isnan) + +} // namespace op::isnan::nvidia diff --git a/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh new file mode 100644 index 000000000..341adf103 --- /dev/null +++ b/src/infiniop/ops/isnan/nvidia/isnan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ISNAN_NVIDIA_API_H__ +#define __ISNAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(isnan, nvidia) + +#endif // __ISNAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/isnan/operator.cc b/src/infiniop/ops/isnan/operator.cc new file mode 100644 index 000000000..fedad566a --- /dev/null +++ b/src/infiniop/ops/isnan/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/isnan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/isnan_nvidia.cuh" +#endif + +UNARY_OP_IMPL(isnan, IsNan) diff --git a/src/infiniop/ops/le/cpu/le_cpu.cc b/src/infiniop/ops/le/cpu/le_cpu.cc new file mode 100644 
index 000000000..9be2df7a0 --- /dev/null +++ b/src/infiniop/ops/le/cpu/le_cpu.cc @@ -0,0 +1,8 @@ +#include "le_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::le::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(le) + +} // namespace op::le::cpu diff --git a/src/infiniop/ops/le/cpu/le_cpu.h b/src/infiniop/ops/le/cpu/le_cpu.h new file mode 100644 index 000000000..2a5679a16 --- /dev/null +++ b/src/infiniop/ops/le/cpu/le_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LE_CPU_H__ +#define __LE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(le, cpu, op::elementwise::binary::BinaryMode::LessOrEqual) + +#endif // __LE_CPU_H__ diff --git a/src/infiniop/ops/le/cuda/kernel.cuh b/src/infiniop/ops/le/cuda/kernel.cuh new file mode 100644 index 000000000..d8d64f9f9 --- /dev/null +++ b/src/infiniop/ops/le/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LE_CUDA_H__ +#define __LE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::le::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::le::cuda + +#endif // __LE_CUDA_H__ diff --git a/src/infiniop/ops/le/nvidia/le_nvidia.cu b/src/infiniop/ops/le/nvidia/le_nvidia.cu new file mode 100644 index 000000000..93d1327fb --- /dev/null +++ b/src/infiniop/ops/le/nvidia/le_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "le_nvidia.cuh" + +namespace op::le::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(le) + +} // namespace op::le::nvidia diff --git a/src/infiniop/ops/le/nvidia/le_nvidia.cuh b/src/infiniop/ops/le/nvidia/le_nvidia.cuh new file mode 100644 index 000000000..62ea3d392 --- /dev/null +++ b/src/infiniop/ops/le/nvidia/le_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LE_CUDA_API_H__ +#define __LE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(le, nvidia) + +#endif // __LE_CUDA_API_H__ diff --git a/src/infiniop/ops/le/operator.cc b/src/infiniop/ops/le/operator.cc new file mode 100644 index 000000000..1809a3241 --- /dev/null +++ b/src/infiniop/ops/le/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/le_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/le_nvidia.cuh" +#endif + +BINARY_OP_IMPL(le, Le) diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.cc b/src/infiniop/ops/log10/cpu/log10_cpu.cc new file mode 100644 index 000000000..2b28eff21 --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.cc @@ -0,0 +1,8 @@ +#include "log10_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::log10::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(log10) + +} // namespace op::log10::cpu diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.h b/src/infiniop/ops/log10/cpu/log10_cpu.h new file mode 100644 index 000000000..5ddc6d437 --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOG10_CPU_H__ +#define __LOG10_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(log10, cpu, op::elementwise::unary::UnaryMode::Log10) + +#endif // __LOG10_CPU_H__ diff --git a/src/infiniop/ops/log10/cuda/kernel.cuh b/src/infiniop/ops/log10/cuda/kernel.cuh new file mode 100644 index 000000000..eca7e0517 --- 
/dev/null +++ b/src/infiniop/ops/log10/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOG10_CUDA_H__ +#define __LOG10_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::log10::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::log10::cuda + +#endif // __LOG10_CUDA_H__ diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu new file mode 100644 index 000000000..fbfca7d35 --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "log10_nvidia.cuh" + +namespace op::log10::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(log10) + +} // namespace op::log10::nvidia diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh new file mode 100644 index 000000000..1514994eb --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG10_CUDA_API_H__ +#define __LOG10_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log10, nvidia) + +#endif // __LOG10_CUDA_API_H__ diff --git a/src/infiniop/ops/log10/operator.cc b/src/infiniop/ops/log10/operator.cc new file mode 100644 index 000000000..da799c858 --- /dev/null +++ b/src/infiniop/ops/log10/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log10_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log10_nvidia.cuh" +#endif + +UNARY_OP_IMPL(log10, Log10) diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.cc b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc new file mode 100644 index 000000000..12aa543a9 --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc @@ -0,0 +1,8 @@ +#include "log1p_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::log1p::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(log1p) + +} // namespace op::log1p::cpu diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.h b/src/infiniop/ops/log1p/cpu/log1p_cpu.h new file mode 100644 index 000000000..0c999a48a --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOG1P_CPU_H__ +#define __LOG1P_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(log1p, cpu, op::elementwise::unary::UnaryMode::Log1p) + +#endif // __LOG1P_CPU_H__ diff --git a/src/infiniop/ops/log1p/cuda/kernel.cuh b/src/infiniop/ops/log1p/cuda/kernel.cuh new file mode 100644 index 000000000..0bcfc73de --- /dev/null +++ b/src/infiniop/ops/log1p/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOG1P_CUDA_H__ +#define __LOG1P_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::log1p::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::log1p::cuda + +#endif // __LOG1P_CUDA_H__ diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu new file mode 100644 index 000000000..695b7c743 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "log1p_nvidia.cuh" + +namespace op::log1p::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(log1p) + +} // namespace 
op::log1p::nvidia diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh new file mode 100644 index 000000000..2522dbc08 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG1P_NVIDIA_API_H__ +#define __LOG1P_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log1p, nvidia) + +#endif // __LOG1P_NVIDIA_API_H__ diff --git a/src/infiniop/ops/log1p/operator.cc b/src/infiniop/ops/log1p/operator.cc new file mode 100644 index 000000000..795e9c009 --- /dev/null +++ b/src/infiniop/ops/log1p/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log1p_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log1p_nvidia.cuh" +#endif + +UNARY_OP_IMPL(log1p, Log1p) diff --git a/src/infiniop/ops/log2/cpu/log2_cpu.cc b/src/infiniop/ops/log2/cpu/log2_cpu.cc new file mode 100644 index 000000000..79676e4ab --- /dev/null +++ b/src/infiniop/ops/log2/cpu/log2_cpu.cc @@ -0,0 +1,8 @@ +#include "log2_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::log2::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY_EXTENDED(log2) + +} // namespace op::log2::cpu diff --git a/src/infiniop/ops/log2/cpu/log2_cpu.h b/src/infiniop/ops/log2/cpu/log2_cpu.h new file mode 100644 index 000000000..db62d672c --- /dev/null +++ b/src/infiniop/ops/log2/cpu/log2_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOG2_CPU_H__ +#define __LOG2_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(log2, cpu, op::elementwise::unary::UnaryMode::Log2) + +#endif // __LOG2_CPU_H__ diff --git a/src/infiniop/ops/log2/cuda/kernel.cuh b/src/infiniop/ops/log2/cuda/kernel.cuh new file mode 100644 index 000000000..3d50dd7ae --- /dev/null +++ b/src/infiniop/ops/log2/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOG2_CUDA_H__ +#define __LOG2_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::log2::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::log2::cuda + +#endif // __LOG2_CUDA_H__ diff --git a/src/infiniop/ops/log2/nvidia/log2_nvidia.cu b/src/infiniop/ops/log2/nvidia/log2_nvidia.cu new file mode 100644 index 000000000..68a4e33a0 --- /dev/null +++ b/src/infiniop/ops/log2/nvidia/log2_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "log2_nvidia.cuh" + +namespace op::log2::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY_EXTENDED(log2) + +} // namespace op::log2::nvidia diff --git a/src/infiniop/ops/log2/nvidia/log2_nvidia.cuh b/src/infiniop/ops/log2/nvidia/log2_nvidia.cuh new file mode 100644 index 000000000..e1fc178e9 --- /dev/null +++ b/src/infiniop/ops/log2/nvidia/log2_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG2_CUDA_API_H__ +#define __LOG2_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log2, nvidia) + +#endif // __LOG2_CUDA_API_H__ diff --git a/src/infiniop/ops/log2/operator.cc b/src/infiniop/ops/log2/operator.cc new file mode 100644 index 000000000..70deeaa68 --- /dev/null +++ b/src/infiniop/ops/log2/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log2_cpu.h" +#endif +#if 
defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log2_nvidia.cuh" +#endif + +UNARY_OP_IMPL(log2, Log2) diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..4f664c648 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,8 @@ +#include "logical_and_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::logical_and::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(logical_and) + +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..531a8d31d --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(logical_and, cpu, op::elementwise::binary::BinaryMode::LogicalAnd) + +#endif // __LOGICAL_AND_CPU_H__ diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..85b80fee6 --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::logical_and::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::logical_and::cuda + +#endif // __LOGICAL_AND_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..2bf34aa08 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "logical_and_nvidia.cuh" + +namespace op::logical_and::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(logical_and) + +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..9d68754bf --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif // __LOGICAL_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..036d8d061 --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/logical_and_nvidia.cuh" +#endif + +BINARY_OP_IMPL(logical_and, LogicalAnd) diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..dcc824a16 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,8 @@ +#include "logical_or_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::logical_or::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(logical_or) + +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..77b1cc6e7 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(logical_or, cpu, op::elementwise::binary::BinaryMode::LogicalOr) + +#endif // __LOGICAL_OR_CPU_H__ diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..60ec81e28 --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::logical_or::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..1a7830e84 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "logical_or_nvidia.cuh" + +namespace op::logical_or::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(logical_or) + +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..a70bd8da7 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif // __LOGICAL_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..7e06f23a0 --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/logical_or_nvidia.cuh" +#endif + +BINARY_OP_IMPL(logical_or, LogicalOr) diff --git a/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc new file mode 100644 index 000000000..d7512db10 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc @@ -0,0 +1,8 @@ +#include "logical_xor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::logical_xor::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(logical_xor) + +} // namespace op::logical_xor::cpu diff --git a/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h new file mode 100644 index 000000000..2e4b0b038 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LOGICAL_XOR_CPU_H__ +#define __LOGICAL_XOR_CPU_H__ + 
+#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(logical_xor, cpu, op::elementwise::binary::BinaryMode::LogicalXor) + +#endif // __LOGICAL_XOR_CPU_H__ diff --git a/src/infiniop/ops/logical_xor/cuda/kernel.cuh b/src/infiniop/ops/logical_xor/cuda/kernel.cuh new file mode 100644 index 000000000..e3ab59ef9 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LOGICAL_XOR_CUDA_H__ +#define __LOGICAL_XOR_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::logical_xor::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::logical_xor::cuda + +#endif // __LOGICAL_XOR_CUDA_H__ diff --git a/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu new file mode 100644 index 000000000..ac2bd04dc --- /dev/null +++ b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "logical_xor_nvidia.cuh" + +namespace op::logical_xor::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(logical_xor) + +} // namespace op::logical_xor::nvidia diff --git a/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh new file mode 100644 index 000000000..f264f8025 --- /dev/null +++ b/src/infiniop/ops/logical_xor/nvidia/logical_xor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_XOR_CUDA_API_H__ +#define __LOGICAL_XOR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_xor, nvidia) + +#endif // __LOGICAL_XOR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_xor/operator.cc b/src/infiniop/ops/logical_xor/operator.cc new file mode 100644 index 000000000..3af7d138a --- /dev/null +++ b/src/infiniop/ops/logical_xor/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_xor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/logical_xor_nvidia.cuh" +#endif + +BINARY_OP_IMPL(logical_xor, LogicalXor) diff --git a/src/infiniop/ops/lt/cpu/lt_cpu.cc b/src/infiniop/ops/lt/cpu/lt_cpu.cc new file mode 100644 index 000000000..25a5bfae0 --- /dev/null +++ b/src/infiniop/ops/lt/cpu/lt_cpu.cc @@ -0,0 +1,8 @@ +#include "lt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::lt::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(lt) + +} // namespace op::lt::cpu diff --git a/src/infiniop/ops/lt/cpu/lt_cpu.h b/src/infiniop/ops/lt/cpu/lt_cpu.h new file mode 100644 index 000000000..5870ee818 --- /dev/null +++ b/src/infiniop/ops/lt/cpu/lt_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LT_CPU_H__ +#define __LT_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(lt, cpu, op::elementwise::binary::BinaryMode::Less) + +#endif // __LT_CPU_H__ diff --git a/src/infiniop/ops/lt/cuda/kernel.cuh b/src/infiniop/ops/lt/cuda/kernel.cuh new file mode 100644 index 000000000..fa81f75ba --- /dev/null +++ b/src/infiniop/ops/lt/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __LT_CUDA_H__ +#define __LT_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::lt::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace 
op::lt::cuda + +#endif // __LT_CUDA_H__ diff --git a/src/infiniop/ops/lt/nvidia/lt_nvidia.cu b/src/infiniop/ops/lt/nvidia/lt_nvidia.cu new file mode 100644 index 000000000..f019e95fe --- /dev/null +++ b/src/infiniop/ops/lt/nvidia/lt_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "lt_nvidia.cuh" + +namespace op::lt::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(lt) + +} // namespace op::lt::nvidia diff --git a/src/infiniop/ops/lt/nvidia/lt_nvidia.cuh b/src/infiniop/ops/lt/nvidia/lt_nvidia.cuh new file mode 100644 index 000000000..198fe9161 --- /dev/null +++ b/src/infiniop/ops/lt/nvidia/lt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LT_CUDA_API_H__ +#define __LT_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(lt, nvidia) + +#endif // __LT_CUDA_API_H__ diff --git a/src/infiniop/ops/lt/operator.cc b/src/infiniop/ops/lt/operator.cc new file mode 100644 index 000000000..9b392768a --- /dev/null +++ b/src/infiniop/ops/lt/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/lt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/lt_nvidia.cuh" +#endif + +BINARY_OP_IMPL(lt, Lt) diff --git a/src/infiniop/ops/ne/cpu/ne_cpu.cc b/src/infiniop/ops/ne/cpu/ne_cpu.cc new file mode 100644 index 000000000..a8eb321ec --- /dev/null +++ b/src/infiniop/ops/ne/cpu/ne_cpu.cc @@ -0,0 +1,8 @@ +#include "ne_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::ne::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(ne) + +} // namespace op::ne::cpu diff --git a/src/infiniop/ops/ne/cpu/ne_cpu.h b/src/infiniop/ops/ne/cpu/ne_cpu.h new file mode 100644 index 000000000..65e592742 --- /dev/null +++ b/src/infiniop/ops/ne/cpu/ne_cpu.h @@ -0,0 +1,9 @@ +#ifndef __NE_CPU_H__ +#define __NE_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(ne, cpu, op::elementwise::binary::BinaryMode::NotEqual) + +#endif // __NE_CPU_H__ diff --git a/src/infiniop/ops/ne/cuda/kernel.cuh b/src/infiniop/ops/ne/cuda/kernel.cuh new file mode 100644 index 000000000..339819488 --- /dev/null +++ b/src/infiniop/ops/ne/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __NE_CUDA_H__ +#define __NE_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::ne::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::ne::cuda + +#endif // __NE_CUDA_H__ diff --git a/src/infiniop/ops/ne/nvidia/ne_nvidia.cu b/src/infiniop/ops/ne/nvidia/ne_nvidia.cu new file mode 100644 index 000000000..2de40bb53 --- /dev/null +++ b/src/infiniop/ops/ne/nvidia/ne_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "ne_nvidia.cuh" + +namespace op::ne::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(ne) + +} // namespace op::ne::nvidia diff --git a/src/infiniop/ops/ne/nvidia/ne_nvidia.cuh b/src/infiniop/ops/ne/nvidia/ne_nvidia.cuh new file mode 100644 index 000000000..514d2a884 --- /dev/null +++ b/src/infiniop/ops/ne/nvidia/ne_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __NE_CUDA_API_H__ +#define __NE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ne, nvidia) + +#endif // __NE_CUDA_API_H__ diff --git 
a/src/infiniop/ops/ne/operator.cc b/src/infiniop/ops/ne/operator.cc new file mode 100644 index 000000000..cb4c1ed6d --- /dev/null +++ b/src/infiniop/ops/ne/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/ne_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ne_nvidia.cuh" +#endif + +BINARY_OP_IMPL(ne, Ne) diff --git a/src/infiniop/ops/remainder/cpu/remainder_cpu.cc b/src/infiniop/ops/remainder/cpu/remainder_cpu.cc new file mode 100644 index 000000000..6ccb2cd63 --- /dev/null +++ b/src/infiniop/ops/remainder/cpu/remainder_cpu.cc @@ -0,0 +1,8 @@ +#include "remainder_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::remainder::cpu { + +ELEMENTWISE_CPU_IMPL_BINARY(remainder) + +} // namespace op::remainder::cpu diff --git a/src/infiniop/ops/remainder/cpu/remainder_cpu.h b/src/infiniop/ops/remainder/cpu/remainder_cpu.h new file mode 100644 index 000000000..db6b7f760 --- /dev/null +++ b/src/infiniop/ops/remainder/cpu/remainder_cpu.h @@ -0,0 +1,9 @@ +#ifndef __REMAINDER_CPU_H__ +#define __REMAINDER_CPU_H__ + +#include "../../../elementwise/binary.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +BINARY_ELEMENTWISE_DESCRIPTOR(remainder, cpu, op::elementwise::binary::BinaryMode::Remainder) + +#endif // __REMAINDER_CPU_H__ diff --git a/src/infiniop/ops/remainder/cuda/kernel.cuh b/src/infiniop/ops/remainder/cuda/kernel.cuh new file mode 100644 index 000000000..48af7d5d7 --- /dev/null +++ b/src/infiniop/ops/remainder/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __REMAINDER_CUDA_H__ +#define __REMAINDER_CUDA_H__ + +#include "../../../elementwise/binary.h" + +namespace op::remainder::cuda { +using Op = op::elementwise::binary::cuda::BinaryOp; +} // namespace op::remainder::cuda + +#endif // __REMAINDER_CUDA_H__ diff --git a/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu new file mode 100644 index 000000000..9bea21004 --- /dev/null +++ b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "remainder_nvidia.cuh" + +namespace op::remainder::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_BINARY(remainder) + +} // namespace op::remainder::nvidia diff --git a/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh new file mode 100644 index 000000000..d6834dfc4 --- /dev/null +++ b/src/infiniop/ops/remainder/nvidia/remainder_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REMAINDER_NVIDIA_API_H__ +#define __REMAINDER_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(remainder, nvidia) + +#endif // __REMAINDER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/remainder/operator.cc b/src/infiniop/ops/remainder/operator.cc new file mode 100644 index 000000000..c331cebc9 --- /dev/null +++ b/src/infiniop/ops/remainder/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/binary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/remainder_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/remainder_nvidia.cuh" +#endif + +BINARY_OP_IMPL(remainder, Remainder) diff --git a/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc 
b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc new file mode 100644 index 000000000..78b0138c5 --- /dev/null +++ b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.cc @@ -0,0 +1,8 @@ +#include "rsqrt_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::rsqrt::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(rsqrt) + +} // namespace op::rsqrt::cpu diff --git a/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h new file mode 100644 index 000000000..9bafc6609 --- /dev/null +++ b/src/infiniop/ops/rsqrt/cpu/rsqrt_cpu.h @@ -0,0 +1,9 @@ +#ifndef __RSQRT_CPU_H__ +#define __RSQRT_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(rsqrt, cpu, op::elementwise::unary::UnaryMode::Rsqrt) + +#endif // __RSQRT_CPU_H__ diff --git a/src/infiniop/ops/rsqrt/cuda/kernel.cuh b/src/infiniop/ops/rsqrt/cuda/kernel.cuh new file mode 100644 index 000000000..9381f2bae --- /dev/null +++ b/src/infiniop/ops/rsqrt/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __RSQRT_CUDA_H__ +#define __RSQRT_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::rsqrt::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::rsqrt::cuda + +#endif // __RSQRT_CUDA_H__ diff --git a/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu new file mode 100644 index 000000000..4cdaccc19 --- /dev/null +++ b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "rsqrt_nvidia.cuh" + +namespace op::rsqrt::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(rsqrt) + +} // namespace op::rsqrt::nvidia diff --git a/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh new file mode 100644 index 000000000..afffff923 --- /dev/null +++ b/src/infiniop/ops/rsqrt/nvidia/rsqrt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RSQRT_NVIDIA_API_H__ +#define __RSQRT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(rsqrt, nvidia) + +#endif // __RSQRT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/rsqrt/operator.cc b/src/infiniop/ops/rsqrt/operator.cc new file mode 100644 index 000000000..d92e32510 --- /dev/null +++ b/src/infiniop/ops/rsqrt/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/rsqrt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/rsqrt_nvidia.cuh" +#endif + +UNARY_OP_IMPL(rsqrt, Rsqrt) diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..c7b172bb0 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,8 @@ +#include "sin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::sin::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(sin) + +} // namespace op::sin::cpu diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..14117695d --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SIN_CPU_H__ +#define __SIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(sin, cpu, op::elementwise::unary::UnaryMode::Sin) + +#endif // 
__SIN_CPU_H__ diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..918bb8345 --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __SIN_CUDA_H__ +#define __SIN_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::sin::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::sin::cuda + +#endif // __SIN_CUDA_H__ diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..36c667ccf --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(sin) + +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..617119eb4 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_NVIDIA_API_H__ +#define __SIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..6772de6fe --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sin_nvidia.cuh" +#endif + +UNARY_OP_IMPL(sin, Sin) diff --git a/src/infiniop/ops/sinc/cpu/sinc_cpu.cc b/src/infiniop/ops/sinc/cpu/sinc_cpu.cc new file mode 100644 index 000000000..6a5dc21d3 --- /dev/null +++ b/src/infiniop/ops/sinc/cpu/sinc_cpu.cc @@ -0,0 +1,8 @@ +#include "sinc_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::sinc::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(sinc) + +} // namespace op::sinc::cpu diff --git a/src/infiniop/ops/sinc/cpu/sinc_cpu.h b/src/infiniop/ops/sinc/cpu/sinc_cpu.h new file mode 100644 index 000000000..094fb2f30 --- /dev/null +++ b/src/infiniop/ops/sinc/cpu/sinc_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SINC_CPU_H__ +#define __SINC_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(sinc, cpu, op::elementwise::unary::UnaryMode::Sinc) + +#endif // __SINC_CPU_H__ diff --git a/src/infiniop/ops/sinc/cuda/kernel.cuh b/src/infiniop/ops/sinc/cuda/kernel.cuh new file mode 100644 index 000000000..fc31025b0 --- /dev/null +++ b/src/infiniop/ops/sinc/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __SINC_CUDA_H__ +#define __SINC_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::sinc::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::sinc::cuda + +#endif // __SINC_CUDA_H__ diff --git a/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu new file mode 100644 index 000000000..85d36106b --- /dev/null +++ b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "sinc_nvidia.cuh" + +namespace op::sinc::nvidia { + 
+ELEMENTWISE_NVIDIA_IMPL_UNARY(sinc) + +} // namespace op::sinc::nvidia diff --git a/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh new file mode 100644 index 000000000..04058aafb --- /dev/null +++ b/src/infiniop/ops/sinc/nvidia/sinc_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SINC_NVIDIA_API_H__ +#define __SINC_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sinc, nvidia) + +#endif // __SINC_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sinc/operator.cc b/src/infiniop/ops/sinc/operator.cc new file mode 100644 index 000000000..636030916 --- /dev/null +++ b/src/infiniop/ops/sinc/operator.cc @@ -0,0 +1,11 @@ +#include "../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sinc_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sinc_nvidia.cuh" +#endif + +UNARY_OP_IMPL(sinc, Sinc) diff --git a/src/infiniop/ops/square/cpu/square_cpu.cc b/src/infiniop/ops/square/cpu/square_cpu.cc new file mode 100644 index 000000000..2ba497866 --- /dev/null +++ b/src/infiniop/ops/square/cpu/square_cpu.cc @@ -0,0 +1,8 @@ +#include "square_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" + +namespace op::square::cpu { + +ELEMENTWISE_CPU_IMPL_UNARY(square) + +} // namespace op::square::cpu diff --git a/src/infiniop/ops/square/cpu/square_cpu.h b/src/infiniop/ops/square/cpu/square_cpu.h new file mode 100644 index 000000000..014bdcf79 --- /dev/null +++ b/src/infiniop/ops/square/cpu/square_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SQUARE_CPU_H__ +#define __SQUARE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" + +UNARY_ELEMENTWISE_DESCRIPTOR(square, cpu, op::elementwise::unary::UnaryMode::Square) + +#endif // __SQUARE_CPU_H__ diff --git a/src/infiniop/ops/square/cuda/kernel.cuh b/src/infiniop/ops/square/cuda/kernel.cuh new file mode 100644 index 000000000..07dd19124 --- /dev/null +++ b/src/infiniop/ops/square/cuda/kernel.cuh @@ -0,0 +1,10 @@ +#ifndef __SQUARE_CUDA_H__ +#define __SQUARE_CUDA_H__ + +#include "../../../elementwise/unary.h" + +namespace op::square::cuda { +using Op = op::elementwise::unary::cuda::UnaryOp; +} // namespace op::square::cuda + +#endif // __SQUARE_CUDA_H__ diff --git a/src/infiniop/ops/square/nvidia/square_nvidia.cu b/src/infiniop/ops/square/nvidia/square_nvidia.cu new file mode 100644 index 000000000..ac446d4cf --- /dev/null +++ b/src/infiniop/ops/square/nvidia/square_nvidia.cu @@ -0,0 +1,10 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" + +#include "../cuda/kernel.cuh" +#include "square_nvidia.cuh" + +namespace op::square::nvidia { + +ELEMENTWISE_NVIDIA_IMPL_UNARY(square) + +} // namespace op::square::nvidia diff --git a/src/infiniop/ops/square/nvidia/square_nvidia.cuh b/src/infiniop/ops/square/nvidia/square_nvidia.cuh new file mode 100644 index 000000000..deed0905a --- /dev/null +++ b/src/infiniop/ops/square/nvidia/square_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SQUARE_NVIDIA_API_H__ +#define __SQUARE_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(square, nvidia) + +#endif // __SQUARE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/square/operator.cc b/src/infiniop/ops/square/operator.cc new file mode 100644 index 000000000..b66e1621e --- /dev/null +++ b/src/infiniop/ops/square/operator.cc @@ -0,0 +1,11 @@ +#include 
"../../operator_impl.h" +#include "infiniop/ops/unary_ops_api.h" + +#ifdef ENABLE_CPU_API +#include "cpu/square_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/square_nvidia.cuh" +#endif + +UNARY_OP_IMPL(square, Square) diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index a0f7cbccb..afa0e7bef 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -303,6 +303,108 @@ def pow_(lib): ] +@OpRegister.operator +def copysign_(lib): + lib.infiniopCreateCopySignDescriptor.restype = c_int32 + lib.infiniopCreateCopySignDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCopySignWorkspaceSize.restype = c_int32 + lib.infiniopGetCopySignWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCopySign.restype = c_int32 + lib.infiniopCopySign.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCopySignDescriptor.restype = c_int32 + lib.infiniopDestroyCopySignDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hypot_(lib): + lib.infiniopCreateHypotDescriptor.restype = c_int32 + lib.infiniopCreateHypotDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHypotWorkspaceSize.restype = c_int32 + lib.infiniopGetHypotWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHypot.restype = c_int32 + lib.infiniopHypot.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHypotDescriptor.restype = c_int32 + lib.infiniopDestroyHypotDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atan2_(lib): + lib.infiniopCreateAtan2Descriptor.restype = c_int32 + lib.infiniopCreateAtan2Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAtan2WorkspaceSize.restype = c_int32 + lib.infiniopGetAtan2WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAtan2.restype = c_int32 + lib.infiniopAtan2.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAtan2Descriptor.restype = c_int32 + lib.infiniopDestroyAtan2Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def div_(lib): lib.infiniopCreateDivDescriptor.restype = c_int32 @@ -314,14 +416,618 @@ def div_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetDivWorkspaceSize.restype = c_int32 - lib.infiniopGetDivWorkspaceSize.argtypes = [ + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + 
lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def floor_divide_(lib): + lib.infiniopCreateFloorDivideDescriptor.restype = c_int32 + lib.infiniopCreateFloorDivideDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetFloorDivideWorkspaceSize.restype = c_int32 + lib.infiniopGetFloorDivideWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopFloorDivide.restype = c_int32 + lib.infiniopFloorDivide.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyFloorDivideDescriptor.restype = c_int32 + lib.infiniopDestroyFloorDivideDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def mod_(lib): + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetModWorkspaceSize.restype = c_int32 + lib.infiniopGetModWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def remainder_(lib): + lib.infiniopCreateRemainderDescriptor.restype = c_int32 + lib.infiniopCreateRemainderDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetRemainderWorkspaceSize.restype = c_int32 + lib.infiniopGetRemainderWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopRemainder.restype = c_int32 + lib.infiniopRemainder.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyRemainderDescriptor.restype = c_int32 + lib.infiniopDestroyRemainderDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_(lib): + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def min_(lib): + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + 
infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMinWorkspaceSize.restype = c_int32 + lib.infiniopGetMinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def fmax_(lib): + lib.infiniopCreateFmaxDescriptor.restype = c_int32 + lib.infiniopCreateFmaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFmaxWorkspaceSize.restype = c_int32 + lib.infiniopGetFmaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFmax.restype = c_int32 + lib.infiniopFmax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFmaxDescriptor.restype = c_int32 + lib.infiniopDestroyFmaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def fmin_(lib): + lib.infiniopCreateFminDescriptor.restype = c_int32 + lib.infiniopCreateFminDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFminWorkspaceSize.restype = c_int32 + lib.infiniopGetFminWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFmin.restype = c_int32 + lib.infiniopFmin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFminDescriptor.restype = c_int32 + lib.infiniopDestroyFminDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gt_(lib): + lib.infiniopCreateGtDescriptor.restype = c_int32 + lib.infiniopCreateGtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGtWorkspaceSize.restype = c_int32 + lib.infiniopGetGtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGt.restype = c_int32 + lib.infiniopGt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGtDescriptor.restype = c_int32 + lib.infiniopDestroyGtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def lt_(lib): + lib.infiniopCreateLtDescriptor.restype = c_int32 + lib.infiniopCreateLtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLtWorkspaceSize.restype = c_int32 + lib.infiniopGetLtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLt.restype = c_int32 + lib.infiniopLt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLtDescriptor.restype = c_int32 + 
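Worth double-checking in this hunk: the fmax_ and fmin_ registrations declare six argument types for infiniopFmax/infiniopFmin, while every other binary entry point here (Div, Mod, Max, Min, Gt, Lt, ...) declares seven, the last c_void_p being the stream. If Fmax/Fmin follow the same (desc, workspace, workspace_size, c, a, b, stream) C signature as the rest, the bindings would presumably need the extra pointer as well, along these lines:

    # Assumed shape if infiniopFmax mirrors the other binary ops;
    # the trailing c_void_p would be the stream argument.
    lib.infiniopFmax.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # c
        c_void_p,  # a
        c_void_p,  # b
        c_void_p,  # stream
    ]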
lib.infiniopDestroyLtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ge_(lib): + lib.infiniopCreateGeDescriptor.restype = c_int32 + lib.infiniopCreateGeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGeWorkspaceSize.restype = c_int32 + lib.infiniopGetGeWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGe.restype = c_int32 + lib.infiniopGe.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGeDescriptor.restype = c_int32 + lib.infiniopDestroyGeDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def le_(lib): + lib.infiniopCreateLeDescriptor.restype = c_int32 + lib.infiniopCreateLeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLeWorkspaceSize.restype = c_int32 + lib.infiniopGetLeWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLe.restype = c_int32 + lib.infiniopLe.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeDescriptor.restype = c_int32 + lib.infiniopDestroyLeDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def eq_(lib): + lib.infiniopCreateEqDescriptor.restype = c_int32 + lib.infiniopCreateEqDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetEqWorkspaceSize.restype = c_int32 + lib.infiniopGetEqWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopEq.restype = c_int32 + lib.infiniopEq.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyEqDescriptor.restype = c_int32 + lib.infiniopDestroyEqDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ne_(lib): + lib.infiniopCreateNeDescriptor.restype = c_int32 + lib.infiniopCreateNeDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetNeWorkspaceSize.restype = c_int32 + lib.infiniopGetNeWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopNe.restype = c_int32 + lib.infiniopNe.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyNeDescriptor.restype = c_int32 + lib.infiniopDestroyNeDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_and_(lib): + lib.infiniopCreateLogicalAndDescriptor.restype = c_int32 + lib.infiniopCreateLogicalAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalAndWorkspaceSize.argtypes 
= [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalAnd.restype = c_int32 + lib.infiniopLogicalAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_or_(lib): + lib.infiniopCreateLogicalOrDescriptor.restype = c_int32 + lib.infiniopCreateLogicalOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalOr.restype = c_int32 + lib.infiniopLogicalOr.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalOrDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_xor_(lib): + lib.infiniopCreateLogicalXorDescriptor.restype = c_int32 + lib.infiniopCreateLogicalXorDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalXorWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalXorWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalXor.restype = c_int32 + lib.infiniopLogicalXor.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalXorDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalXorDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def bitwise_and_(lib): + lib.infiniopCreateBitwiseAndDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBitwiseAndWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseAndWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopBitwiseAnd.restype = c_int32 + lib.infiniopBitwiseAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyBitwiseAndDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def bitwise_or_(lib): + lib.infiniopCreateBitwiseOrDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBitwiseOrWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseOrWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopDiv.restype = c_int32 - lib.infiniopDiv.argtypes = [ + lib.infiniopBitwiseOr.restype = c_int32 + lib.infiniopBitwiseOr.argtypes = [ infiniopOperatorDescriptor_t, 
c_void_p, c_size_t, @@ -331,16 +1037,16 @@ def div_(lib): c_void_p, ] - lib.infiniopDestroyDivDescriptor.restype = c_int32 - lib.infiniopDestroyDivDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseOrDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseOrDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @OpRegister.operator -def mod_(lib): - lib.infiniopCreateModDescriptor.restype = c_int32 - lib.infiniopCreateModDescriptor.argtypes = [ +def bitwise_xor_(lib): + lib.infiniopCreateBitwiseXorDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseXorDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, @@ -348,14 +1054,14 @@ def mod_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetModWorkspaceSize.restype = c_int32 - lib.infiniopGetModWorkspaceSize.argtypes = [ + lib.infiniopGetBitwiseXorWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseXorWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopMod.restype = c_int32 - lib.infiniopMod.argtypes = [ + lib.infiniopBitwiseXor.restype = c_int32 + lib.infiniopBitwiseXor.argtypes = [ infiniopOperatorDescriptor_t, c_void_p, c_size_t, @@ -365,16 +1071,16 @@ def mod_(lib): c_void_p, ] - lib.infiniopDestroyModDescriptor.restype = c_int32 - lib.infiniopDestroyModDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseXorDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseXorDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @OpRegister.operator -def max_(lib): - lib.infiniopCreateMaxDescriptor.restype = c_int32 - lib.infiniopCreateMaxDescriptor.argtypes = [ +def bitwise_left_shift_(lib): + lib.infiniopCreateBitwiseLeftShiftDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseLeftShiftDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, @@ -382,14 +1088,14 @@ def max_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetMaxWorkspaceSize.restype = c_int32 - lib.infiniopGetMaxWorkspaceSize.argtypes = [ + lib.infiniopGetBitwiseLeftShiftWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseLeftShiftWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopMax.restype = c_int32 - lib.infiniopMax.argtypes = [ + lib.infiniopBitwiseLeftShift.restype = c_int32 + lib.infiniopBitwiseLeftShift.argtypes = [ infiniopOperatorDescriptor_t, c_void_p, c_size_t, @@ -399,16 +1105,16 @@ def max_(lib): c_void_p, ] - lib.infiniopDestroyMaxDescriptor.restype = c_int32 - lib.infiniopDestroyMaxDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseLeftShiftDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseLeftShiftDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @OpRegister.operator -def min_(lib): - lib.infiniopCreateMinDescriptor.restype = c_int32 - lib.infiniopCreateMinDescriptor.argtypes = [ +def bitwise_right_shift_(lib): + lib.infiniopCreateBitwiseRightShiftDescriptor.restype = c_int32 + lib.infiniopCreateBitwiseRightShiftDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, @@ -416,14 +1122,14 @@ def min_(lib): infiniopTensorDescriptor_t, ] - lib.infiniopGetMinWorkspaceSize.restype = c_int32 - lib.infiniopGetMinWorkspaceSize.argtypes = [ + lib.infiniopGetBitwiseRightShiftWorkspaceSize.restype = c_int32 + lib.infiniopGetBitwiseRightShiftWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopMin.restype = c_int32 - lib.infiniopMin.argtypes = [ + 
lib.infiniopBitwiseRightShift.restype = c_int32 + lib.infiniopBitwiseRightShift.argtypes = [ infiniopOperatorDescriptor_t, c_void_p, c_size_t, @@ -433,8 +1139,8 @@ def min_(lib): c_void_p, ] - lib.infiniopDestroyMinDescriptor.restype = c_int32 - lib.infiniopDestroyMinDescriptor.argtypes = [ + lib.infiniopDestroyBitwiseRightShiftDescriptor.restype = c_int32 + lib.infiniopDestroyBitwiseRightShiftDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] @@ -1021,6 +1727,64 @@ def sqrt_(lib): ] +@OpRegister.operator +def square_(lib): + lib.infiniopCreateSquareDescriptor.restype = c_int32 + lib.infiniopCreateSquareDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSquareWorkspaceSize.restype = c_int32 + lib.infiniopGetSquareWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSquare.restype = c_int32 + lib.infiniopSquare.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySquareDescriptor.restype = c_int32 + lib.infiniopDestroySquareDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def rsqrt_(lib): + lib.infiniopCreateRsqrtDescriptor.restype = c_int32 + lib.infiniopCreateRsqrtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRsqrtWorkspaceSize.restype = c_int32 + lib.infiniopGetRsqrtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopRsqrt.restype = c_int32 + lib.infiniopRsqrt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRsqrtDescriptor.restype = c_int32 + lib.infiniopDestroyRsqrtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def log_(lib): lib.infiniopCreateLogDescriptor.restype = c_int32 @@ -1050,6 +1814,122 @@ def log_(lib): ] +@OpRegister.operator +def log2_(lib): + lib.infiniopCreateLog2Descriptor.restype = c_int32 + lib.infiniopCreateLog2Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLog2WorkspaceSize.restype = c_int32 + lib.infiniopGetLog2WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog2.restype = c_int32 + lib.infiniopLog2.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLog2Descriptor.restype = c_int32 + lib.infiniopDestroyLog2Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log10_(lib): + lib.infiniopCreateLog10Descriptor.restype = c_int32 + lib.infiniopCreateLog10Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLog10WorkspaceSize.restype = c_int32 + lib.infiniopGetLog10WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog10.restype = c_int32 + lib.infiniopLog10.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLog10Descriptor.restype = c_int32 + lib.infiniopDestroyLog10Descriptor.argtypes = [ + 
infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log1p_(lib): + lib.infiniopCreateLog1pDescriptor.restype = c_int32 + lib.infiniopCreateLog1pDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLog1pWorkspaceSize.restype = c_int32 + lib.infiniopGetLog1pWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog1p.restype = c_int32 + lib.infiniopLog1p.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLog1pDescriptor.restype = c_int32 + lib.infiniopDestroyLog1pDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def tan_(lib): lib.infiniopCreateTanDescriptor.restype = c_int32 @@ -2031,6 +2911,35 @@ def exp_(lib): ] +@OpRegister.operator +def exp2_(lib): + lib.infiniopCreateExp2Descriptor.restype = c_int32 + lib.infiniopCreateExp2Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetExp2WorkspaceSize.restype = c_int32 + lib.infiniopGetExp2WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopExp2.restype = c_int32 + lib.infiniopExp2.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyExp2Descriptor.restype = c_int32 + lib.infiniopDestroyExp2Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def gather_(lib): lib.infiniopCreateGatherDescriptor.restype = c_int32 @@ -2092,6 +3001,122 @@ def hardswish_(lib): ] +@OpRegister.operator +def isnan_(lib): + lib.infiniopCreateIsNanDescriptor.restype = c_int32 + lib.infiniopCreateIsNanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetIsNanWorkspaceSize.restype = c_int32 + lib.infiniopGetIsNanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIsNan.restype = c_int32 + lib.infiniopIsNan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIsNanDescriptor.restype = c_int32 + lib.infiniopDestroyIsNanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def isinf_(lib): + lib.infiniopCreateIsInfDescriptor.restype = c_int32 + lib.infiniopCreateIsInfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetIsInfWorkspaceSize.restype = 
c_int32 + lib.infiniopGetIsInfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIsInf.restype = c_int32 + lib.infiniopIsInf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIsInfDescriptor.restype = c_int32 + lib.infiniopDestroyIsInfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def isfinite_(lib): + lib.infiniopCreateIsFiniteDescriptor.restype = c_int32 + lib.infiniopCreateIsFiniteDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetIsFiniteWorkspaceSize.restype = c_int32 + lib.infiniopGetIsFiniteWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIsFinite.restype = c_int32 + lib.infiniopIsFinite.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIsFiniteDescriptor.restype = c_int32 + lib.infiniopDestroyIsFiniteDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sinc_(lib): + lib.infiniopCreateSincDescriptor.restype = c_int32 + lib.infiniopCreateSincDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSincWorkspaceSize.restype = c_int32 + lib.infiniopGetSincWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSinc.restype = c_int32 + lib.infiniopSinc.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySincDescriptor.restype = c_int32 + lib.infiniopDestroySincDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def index_copy_inplace_(lib): lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 9b43c47c5..b6d8d4207 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -83,8 +83,25 @@ def __init__( InfiniDtype.BYTE, InfiniDtype.BOOL, ]: - randint_low = -2000000000 if randint_low is None else randint_low - randint_high = 2000000000 if randint_high is None else randint_high + # Set appropriate default ranges based on dtype + if randint_low is None or randint_high is None: + if dt == InfiniDtype.U8 or dt == InfiniDtype.BYTE: + randint_low = 0 if randint_low is None else randint_low + randint_high = 256 if randint_high is None else randint_high + elif dt == InfiniDtype.BOOL: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2 if randint_high is None else randint_high + elif dt == InfiniDtype.U16: + randint_low = 0 if randint_low is None else randint_low + randint_high = 65536 if randint_high is None else randint_high + elif dt in [InfiniDtype.U32, InfiniDtype.U64]: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + else: + # For signed integer types (I8, I16, I32, I64) + randint_low = -2000000000 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + self._torch_tensor = torch.randint( randint_low, randint_high, @@ -107,8 +124,25 @@ def __init__( torch_shape, dtype=to_torch_dtype(dt), 
device=torch_device_map[device] ) elif mode == "randint": - randint_low = -2000000000 if randint_low is None else randint_low - randint_high = 2000000000 if randint_high is None else randint_high + # Set appropriate default ranges based on dtype (same logic as mode="random") + if randint_low is None or randint_high is None: + if dt == InfiniDtype.U8 or dt == InfiniDtype.BYTE: + randint_low = 0 if randint_low is None else randint_low + randint_high = 256 if randint_high is None else randint_high + elif dt == InfiniDtype.BOOL: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2 if randint_high is None else randint_high + elif dt == InfiniDtype.U16: + randint_low = 0 if randint_low is None else randint_low + randint_high = 65536 if randint_high is None else randint_high + elif dt in [InfiniDtype.U32, InfiniDtype.U64]: + randint_low = 0 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + else: + # For signed integer types (I8, I16, I32, I64) + randint_low = -2000000000 if randint_low is None else randint_low + randint_high = 2000000000 if randint_high is None else randint_high + self._torch_tensor = torch.randint( randint_low, randint_high, diff --git a/test/infiniop/test_all_binary_ops.py b/test/infiniop/test_all_binary_ops.py index e08b3e41b..61fe15574 100644 --- a/test/infiniop/test_all_binary_ops.py +++ b/test/infiniop/test_all_binary_ops.py @@ -50,6 +50,31 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class FloorDivideTest(BinaryTestBase): + OP_NAME = "FloorDivide" + OP_NAME_LOWER = "floor_divide" + + @staticmethod + def torch_op(c, a, b): + torch.floor_divide(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For division, ensure b doesn't contain zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + class PowTest(BinaryTestBase): OP_NAME = "Pow" OP_NAME_LOWER = "pow" @@ -75,6 +100,81 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class CopySignTest(BinaryTestBase): + OP_NAME = "CopySign" + OP_NAME_LOWER = "copysign" + + @staticmethod + def torch_op(c, a, b): + torch.copysign(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Generate values with various magnitudes + return TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Generate values with various signs + return TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class HypotTest(BinaryTestBase): + OP_NAME = "Hypot" + OP_NAME_LOWER = "hypot" + + @staticmethod + def torch_op(c, a, b): + torch.hypot(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=-5.0) + + 
TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class Atan2Test(BinaryTestBase): + OP_NAME = "Atan2" + OP_NAME_LOWER = "atan2" + + @staticmethod + def torch_op(c, a, b): + torch.atan2(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For atan2, avoid zeros in denominator (b) + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + class ModTest(BinaryTestBase): OP_NAME = "Mod" OP_NAME_LOWER = "mod" @@ -100,6 +200,31 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class RemainderTest(BinaryTestBase): + OP_NAME = "Remainder" + OP_NAME_LOWER = "remainder" + + @staticmethod + def torch_op(c, a, b): + torch.remainder(a, b, out=c) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Avoid zeros + return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + class MaxTest(BinaryTestBase): OP_NAME = "Max" OP_NAME_LOWER = "max" @@ -148,6 +273,466 @@ def generate_input_b(shape, b_stride, dtype, device): EQUAL_NAN = True +class FmaxTest(BinaryTestBase): + OP_NAME = "Fmax" + OP_NAME_LOWER = "fmax" + + @staticmethod + def torch_op(c, a, b): + # torch.fmax ignores NaN: if one is NaN, return the other + result = torch.fmax(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class FminTest(BinaryTestBase): + OP_NAME = "Fmin" + OP_NAME_LOWER = "fmin" + + @staticmethod + def torch_op(c, a, b): + # torch.fmin ignores NaN: if one is NaN, return the other + result = torch.fmin(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class GtTest(BinaryTestBase): + OP_NAME = "Gt" + OP_NAME_LOWER = "gt" + + @staticmethod + def torch_op(c, a, b): + # torch.gt returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.gt(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, 
"rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LtTest(BinaryTestBase): + OP_NAME = "Lt" + OP_NAME_LOWER = "lt" + + @staticmethod + def torch_op(c, a, b): + # torch.lt returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.lt(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class GeTest(BinaryTestBase): + OP_NAME = "Ge" + OP_NAME_LOWER = "ge" + + @staticmethod + def torch_op(c, a, b): + # torch.ge returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.ge(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LeTest(BinaryTestBase): + OP_NAME = "Le" + OP_NAME_LOWER = "le" + + @staticmethod + def torch_op(c, a, b): + # torch.le returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.le(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class EqTest(BinaryTestBase): + OP_NAME = "Eq" + OP_NAME_LOWER = "eq" + + @staticmethod + def torch_op(c, a, b): + # torch.eq returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.eq(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class NeTest(BinaryTestBase): + OP_NAME = "Ne" + OP_NAME_LOWER = "ne" + + @staticmethod + def torch_op(c, a, b): + # torch.ne returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.ne(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LogicalAndTest(BinaryTestBase): + OP_NAME = "LogicalAnd" + OP_NAME_LOWER = "logical_and" + + @staticmethod + def torch_op(c, a, b): + # torch.logical_and returns bool, convert to float (1.0 or 0.0) to match our implementation + result = 
torch.logical_and(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LogicalOrTest(BinaryTestBase): + OP_NAME = "LogicalOr" + OP_NAME_LOWER = "logical_or" + + @staticmethod + def torch_op(c, a, b): + # torch.logical_or returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.logical_or(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class LogicalXorTest(BinaryTestBase): + OP_NAME = "LogicalXor" + OP_NAME_LOWER = "logical_xor" + + @staticmethod + def torch_op(c, a, b): + # torch.logical_xor returns bool, convert to float (1.0 or 0.0) to match our implementation + result = torch.logical_xor(a, b) + c.copy_(result.float()) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + EQUAL_NAN = True + + +class BitwiseAndTest(BinaryTestBase): + OP_NAME = "BitwiseAnd" + OP_NAME_LOWER = "bitwise_and" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_and only supports integral types + result = torch.bitwise_and(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseOrTest(BinaryTestBase): + OP_NAME = "BitwiseOr" + OP_NAME_LOWER = "bitwise_or" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_or only supports integral types + result = torch.bitwise_or(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, 
"rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseXorTest(BinaryTestBase): + OP_NAME = "BitwiseXor" + OP_NAME_LOWER = "bitwise_xor" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_xor only supports integral types + result = torch.bitwise_xor(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseLeftShiftTest(BinaryTestBase): + OP_NAME = "BitwiseLeftShift" + OP_NAME_LOWER = "bitwise_left_shift" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_left_shift only supports integral types + result = torch.bitwise_left_shift(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For shift operations, b should be non-negative and within reasonable range + # Generate shift amounts between 0 and bit_width-1 for each type + if dtype == InfiniDtype.U8: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=8) + elif dtype == InfiniDtype.I32: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=32) + elif dtype == InfiniDtype.I64: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=64) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + +class BitwiseRightShiftTest(BinaryTestBase): + OP_NAME = "BitwiseRightShift" + OP_NAME_LOWER = "bitwise_right_shift" + + @staticmethod + def torch_op(c, a, b): + # torch.bitwise_right_shift only supports integral types + result = torch.bitwise_right_shift(a, b) + c.copy_(result) + + @staticmethod + def generate_input_a(shape, a_stride, dtype, device): + # Use default TestTensor (utils.py now handles correct ranges for integral types) + return TestTensor(shape, a_stride, dtype, device) + + @staticmethod + def generate_input_b(shape, b_stride, dtype, device): + # For shift operations, b should be non-negative and within reasonable range + # Generate shift amounts between 0 and bit_width-1 for each type + if dtype == InfiniDtype.U8: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=8) + elif dtype == InfiniDtype.I32: + return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=32) + elif dtype == InfiniDtype.I64: + 
return TestTensor(shape, b_stride, dtype, device, randint_low=0, randint_high=64) + return TestTensor(shape, b_stride, dtype, device) + + TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.U8: {"atol": 0, "rtol": 0}, + } + + # Bitwise operations only support integral types + TENSOR_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U8] + + EQUAL_NAN = True + + # ============================================================================== # 算子注册表 # ============================================================================== @@ -155,10 +740,31 @@ def generate_input_b(shape, b_stride, dtype, device): # 所有 binary 算子的测试类映射 BINARY_OP_TESTS = { "div": DivTest, + "floor_divide": FloorDivideTest, "pow": PowTest, + "copysign": CopySignTest, + "hypot": HypotTest, + "atan2": Atan2Test, "mod": ModTest, + "remainder": RemainderTest, "max": MaxTest, "min": MinTest, + "fmax": FmaxTest, + "fmin": FminTest, + "gt": GtTest, + "lt": LtTest, + "ge": GeTest, + "le": LeTest, + "eq": EqTest, + "ne": NeTest, + "logical_and": LogicalAndTest, + "logical_or": LogicalOrTest, + "logical_xor": LogicalXorTest, + "bitwise_and": BitwiseAndTest, + "bitwise_or": BitwiseOrTest, + "bitwise_xor": BitwiseXorTest, + "bitwise_left_shift": BitwiseLeftShiftTest, + "bitwise_right_shift": BitwiseRightShiftTest, } diff --git a/test/infiniop/test_all_unary_ops.py b/test/infiniop/test_all_unary_ops.py index 2a65cf938..54a8ef70c 100644 --- a/test/infiniop/test_all_unary_ops.py +++ b/test/infiniop/test_all_unary_ops.py @@ -185,6 +185,28 @@ def generate_input(shape, dtype, device): } +class SinTest(UnaryTestBase): + OP_NAME = "Sin" + OP_NAME_LOWER = "sin" + + @staticmethod + def torch_op(x): + return torch.sin(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate test tensors with values in range [-200, -100) for sin operation + # sin domain is (-∞, +∞), so we use range [-200, -100) + return torch.rand(shape, dtype=dtype, device=device) * 100 - 200 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, + } + + EQUAL_NAN = True + + class CosTest(UnaryTestBase): OP_NAME = "Cos" OP_NAME_LOWER = "cos" @@ -288,6 +310,77 @@ def generate_input(shape, dtype, device): EQUAL_NAN = True +class Log2Test(UnaryTestBase): + OP_NAME = "Log2" + OP_NAME_LOWER = "log2" + + @staticmethod + def torch_op(x): + return torch.log2(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log2 domain is (0, +∞), so we use range [0.1, 1.1) + return torch.rand(shape, dtype=dtype, device=device) + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + EQUAL_NAN = True + + +class Log10Test(UnaryTestBase): + OP_NAME = "Log10" + OP_NAME_LOWER = "log10" + + @staticmethod + def torch_op(x): + return torch.log10(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log10 domain is (0, +∞), so we use range [0.1, 1.1) + return torch.rand(shape, dtype=dtype, device=device) + 0.1 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + 
EQUAL_NAN = True + + +class Log1pTest(UnaryTestBase): + OP_NAME = "Log1p" + OP_NAME_LOWER = "log1p" + + @staticmethod + def torch_op(x): + return torch.log1p(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # log1p domain is (-1, +∞), so we use range [-0.9, 1.1) + # Include values close to zero to test numerical stability + x = torch.rand(shape, dtype=dtype, device=device) * 2 - 0.9 + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + } + + class NegTest(UnaryTestBase): OP_NAME = "Neg" OP_NAME_LOWER = "neg" @@ -410,6 +503,47 @@ def generate_input(shape, dtype, device): EQUAL_NAN = True +class SquareTest(UnaryTestBase): + OP_NAME = "Square" + OP_NAME_LOWER = "square" + + @staticmethod + def torch_op(x): + return torch.square(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + return torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + } + + EQUAL_NAN = True + + +class RsqrtTest(UnaryTestBase): + OP_NAME = "Rsqrt" + OP_NAME_LOWER = "rsqrt" + + @staticmethod + def torch_op(x): + return torch.rsqrt(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # rsqrt domain is (0, +∞), avoid zero + return torch.rand(shape, dtype=dtype, device=device) * 100 + 1e-6 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 2e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, + } + + EQUAL_NAN = True + + class TanTest(UnaryTestBase): OP_NAME = "Tan" OP_NAME_LOWER = "tan" @@ -452,6 +586,29 @@ def generate_input(shape, dtype, device): TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] +class Exp2Test(UnaryTestBase): + OP_NAME = "Exp2" + OP_NAME_LOWER = "exp2" + + @staticmethod + def torch_op(x): + return torch.exp2(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Keep input in reasonable range to avoid overflow + return torch.rand(shape, dtype=dtype, device=device) * 4 - 2 + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + } + + # Support BF16 + TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + + class HardswishTest(UnaryTestBase): OP_NAME = "Hardswish" OP_NAME_LOWER = "hardswish" @@ -474,6 +631,117 @@ def generate_input(shape, dtype, device): TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] +class IsNanTest(UnaryTestBase): + OP_NAME = "IsNan" + OP_NAME_LOWER = "isnan" + + @staticmethod + def torch_op(x): + return torch.isnan(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate a mix of normal values and NaN values + x = torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + # Set some values to NaN + nan_mask = torch.rand(shape, device=device) < 0.3 + x[nan_mask] = float('nan') + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + } + + EQUAL_NAN = False # For isnan, we want exact match (0 or 1) + + +class IsInfTest(UnaryTestBase): + OP_NAME = "IsInf" + OP_NAME_LOWER = "isinf" + + @staticmethod + def torch_op(x): + return torch.isinf(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate a mix of normal values and Inf values + x = torch.rand(shape, dtype=dtype, 
device=device) * 10 - 5 + # Set some values to Inf + inf_mask = torch.rand(shape, device=device) < 0.3 + x[inf_mask] = float('inf') + # Set some to -Inf + neg_inf_mask = torch.rand(shape, device=device) < 0.15 + x[neg_inf_mask] = float('-inf') + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + } + + EQUAL_NAN = False # For isinf, we want exact match (0 or 1) + + +class IsFiniteTest(UnaryTestBase): + OP_NAME = "IsFinite" + OP_NAME_LOWER = "isfinite" + + @staticmethod + def torch_op(x): + return torch.isfinite(x).to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate a mix of normal values, NaN, and Inf values + x = torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + # Set some values to NaN + nan_mask = torch.rand(shape, device=device) < 0.2 + x[nan_mask] = float('nan') + # Set some values to Inf + inf_mask = torch.rand(shape, device=device) < 0.2 + x[inf_mask] = float('inf') + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + } + + EQUAL_NAN = False # For isfinite, we want exact match (0 or 1) + + +class SincTest(UnaryTestBase): + OP_NAME = "Sinc" + OP_NAME_LOWER = "sinc" + + @staticmethod + def torch_op(x): + # torch.sinc computes the normalized sinc sin(pi*x)/(pi*x); this reference is the + # unnormalized sinc(x) = sin(x) / x with sinc(0) = 1, so compute it manually + result = torch.sin(x) / x + result[x == 0] = 1.0 + return result.to(x.dtype) + + @staticmethod + def generate_input(shape, dtype, device): + # Generate values around zero and some larger values + # Include zero to test the special case + x = torch.rand(shape, dtype=dtype, device=device) * 10 - 5 + # Set some values to exactly zero + zero_mask = torch.rand(shape, device=device) < 0.1 + x[zero_mask] = 0.0 + return x + + TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, # sinc can have larger errors near zero + } + + EQUAL_NAN = True + + + # ============================================================================== # 算子注册表 # ============================================================================== @@ -493,15 +761,26 @@ def generate_input(shape, dtype, device): "erf": ErfTest, "floor": FloorTest, "log": LogTest, + "log2": Log2Test, + "log10": Log10Test, + "log1p": Log1pTest, "neg": NegTest, "reciprocal": ReciprocalTest, "round": RoundTest, "sign": SignTest, + "sin": SinTest, "sinh": SinhTest, "sqrt": SqrtTest, + "square": SquareTest, + "rsqrt": RsqrtTest, "tan": TanTest, "exp": ExpTest, + "exp2": Exp2Test, "hardswish": HardswishTest, + "isnan": IsNanTest, + "isinf": IsInfTest, + "isfinite": IsFiniteTest, + "sinc": SincTest, }
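For reference, every operator added to these registries follows the same shape: a test class whose OP_NAME matches the C symbols bound in op_register.py, a PyTorch reference in torch_op, input generators, a TOLERANCE_MAP, and an entry in the registry dict. Below is a minimal sketch of wiring up one more binary operator; the operator name "Foo" and its torch.add reference are placeholders, and the class is assumed to sit inside test_all_binary_ops.py next to the existing tests so that torch, TestTensor, InfiniDtype and BinaryTestBase are already in scope.

class FooTest(BinaryTestBase):
    # Must match the symbols registered in op_register.py:
    # infiniopCreateFooDescriptor / infiniopGetFooWorkspaceSize / infiniopFoo / infiniopDestroyFooDescriptor.
    OP_NAME = "Foo"
    OP_NAME_LOWER = "foo"

    @staticmethod
    def torch_op(c, a, b):
        # PyTorch reference result, written into the output tensor c.
        torch.add(a, b, out=c)  # placeholder reference op

    @staticmethod
    def generate_input_a(shape, a_stride, dtype, device):
        return TestTensor(shape, a_stride, dtype, device)

    @staticmethod
    def generate_input_b(shape, b_stride, dtype, device):
        # Shift the range away from zero if the operator cannot accept b == 0,
        # mirroring DivTest / ModTest above.
        return TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1)

    TOLERANCE_MAP = {
        InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
        InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    }

    EQUAL_NAN = True


# Register the test so the shared binary-op driver picks it up.
BINARY_OP_TESTS["foo"] = FooTest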