
Commit a1340f7

Namespace updates for cadence ops, adding 6 optimized ops
1 parent e9d9f6c commit a1340f7

File tree

9 files changed (+320, -238 lines)


backends/cadence/aot/functions_hifi.yaml

Lines changed: 6 additions & 6 deletions
@@ -45,12 +45,12 @@
 - op: div.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::div_out
+      kernel_name: cadence::impl::HiFi::div_out

 - op: div.out_mode
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::div_out_mode
+      kernel_name: cadence::impl::HiFi::div_out_mode

 - op: embedding.out
   kernels:
@@ -65,7 +65,7 @@
 - op: mul.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::mul_out
+      kernel_name: cadence::impl::HiFi::mul_out

 - op: permute_copy.out
   kernels:
@@ -75,7 +75,7 @@
 - op: sigmoid.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::sigmoid_out
+      kernel_name: cadence::impl::HiFi::sigmoid_out

 - op: slice_copy.Tensor_out
   kernels:
@@ -90,12 +90,12 @@
 - op: sub.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::sub_out
+      kernel_name: cadence::impl::HiFi::sub_out

 - op: tanh.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::tanh_out
+      kernel_name: cadence::impl::HiFi::tanh_out

 - op: view_copy.out
   kernels:
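Note: the only change in this file is the namespace qualification on each kernel_name, matching the move of the HiFi kernels under the cadence:: namespace. A minimal C++ sketch of why the fully qualified name is needed (div_out here is a hypothetical stand-in, not the real kernel signature):

// Minimal sketch, assuming the kernels are declared under cadence::impl::HiFi.
// div_out is a hypothetical stand-in for the real kernel signature.
namespace cadence {
namespace impl {
namespace HiFi {
inline void div_out() {}
} // namespace HiFi
} // namespace impl
} // namespace cadence

void call_kernel() {
  // impl::HiFi::div_out();        // error: 'impl' is not visible at global scope
  cadence::impl::HiFi::div_out();  // OK: matches the updated kernel_name entries
}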

backends/cadence/hifi/kernels/kernels.h

Lines changed: 44 additions & 0 deletions
@@ -11,7 +11,51 @@
 #include <inttypes.h>
 #include <stddef.h>
 #include <xa_type_def.h>
+/* For NNLIB APIs */
+#include "xa_nnlib_kernels_api.h"

+/* Potential NNLIB function/APIs */
+extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);
+
+extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);
+
+extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const FLOAT32* __restrict__ p_inp1,
+    const FLOAT32* __restrict__ p_inp2,
+    WORD32 num_elm,
+    WORD32 mode);
+
+extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape,
+    WORD32 mode);
+
+extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);
+
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace kernels {
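Note: a minimal usage sketch for the broadcast add kernel declared above, assuming xa_type_def.h and an NNLIB implementation of the function are available at build/link time. Shape arrays are left-padded to 4 entries, which is how op_add.cpp below prepares them (kNnlibMaxDim == 4):

#include <xa_type_def.h>

// Declaration mirrors the one added to kernels.h in this commit.
extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
    FLOAT32* __restrict__ p_out,
    const WORD32* const p_out_shape,
    const FLOAT32* __restrict__ p_inp1,
    const WORD32* const p_inp1_shape,
    const FLOAT32* __restrict__ p_inp2,
    const WORD32* const p_inp2_shape);

void add_row_broadcast() {
  // A (2, 3) input plus a (3,) row, both expressed as left-padded 4-D shapes.
  FLOAT32 a[6] = {1, 2, 3, 4, 5, 6};
  FLOAT32 b[3] = {10, 20, 30};
  FLOAT32 out[6];
  const WORD32 out_shape[4] = {1, 1, 2, 3};
  const WORD32 a_shape[4] = {1, 1, 2, 3};
  const WORD32 b_shape[4] = {1, 1, 1, 3};
  WORD32 status = xa_nn_elm_add_broadcast_4D_f32xf32_f32(
      out, out_shape, a, a_shape, b, b_shape);
  (void)status;  // assumed to be 0 on success, per the usual NNLIB convention
}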

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -47,10 +47,11 @@ set(_aten_ops__srcs
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
-"${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp"
+"${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
+"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
 "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"

backends/cadence/hifi/operators/op_add.cpp

Lines changed: 53 additions & 46 deletions
@@ -83,7 +83,7 @@ Tensor& add_out(
     Tensor& out) {
   ET_KERNEL_CHECK(
       ctx,
-      resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+      torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok,
       InvalidArgument,
       out);

@@ -93,25 +93,36 @@ Tensor& add_out(
       InvalidArgument,
       out);
   ET_KERNEL_CHECK(
-      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
+      ctx,
+      executorch::runtime::tensors_have_same_dim_order(a, b, out),
+      InvalidArgument,
+      out);

   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
-  ScalarType alpha_type =
-      torch::executor::native::utils::get_scalar_dtype(alpha);
-  ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true);
+  ScalarType alpha_type =
+      torch::executor::native::utils::get_scalar_dtype(alpha);
+  ScalarType common_type =
+      executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true);
   ScalarType out_type = out.scalar_type();

-  ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
   ET_KERNEL_CHECK(
-      ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out);
-
+      ctx,
+      executorch::runtime::canCast(common_type, out_type),
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      torch::executor::check_alpha_type(alpha_type, common_type),
+      InvalidArgument,
+      out);
+
   float alpha_val;
   torch::executor::native::utils::extract_scalar(alpha, &alpha_val);

   constexpr auto name = "add.out";
   constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */
-
+
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
   bool optimized = 1;
   /*find broadcast*/
@@ -124,51 +135,48 @@ Tensor& add_out(
   if ((out_type != ScalarType::Float) || (alpha_val != 1.0))
     optimized = 0;

-  if ((a_dim == 0) || (b_dim == 0) )
+  if ((a_dim == 0) || (b_dim == 0))
     optimized = 0;

   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
     optimized = 0;

-
   if (optimized) {
-    const float* const a_data = a.const_data_ptr<float>();
-    const float* const b_data = b.const_data_ptr<float>();
-    float* const out_data = out.mutable_data_ptr<float>();
-
-    if(broadcast == 1) {
-      int out_shape[kNnlibMaxDim];
-      int inp1_shape[kNnlibMaxDim];
-      int inp2_shape[kNnlibMaxDim];
-
-      for (int i = 0; i < kNnlibMaxDim; i++) {
-        out_shape[i] = 1;
-        inp1_shape[i] = 1;
-        inp2_shape[i] = 1;
-      }
-
-      int off_o = kNnlibMaxDim - out.dim();
-      int off_a = kNnlibMaxDim - a.dim();
-      int off_b = kNnlibMaxDim - b.dim();
-
-      for (int i = 0; i < out.dim(); i++)
-        out_shape[i+off_o] = out.size(i);
-      for (int i = 0; i < a.dim(); i++)
-        inp1_shape[i+off_a] = a.size(i);
-      for (int i = 0; i < b.dim(); i++)
-        inp2_shape[i+off_b] = b.size(i);
-
-      xa_nn_elm_add_broadcast_4D_f32xf32_f32(
-          out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
-    }
-    else
-    {
-      xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel());
+    const float* const a_data = a.const_data_ptr<float>();
+    const float* const b_data = b.const_data_ptr<float>();
+    float* const out_data = out.mutable_data_ptr<float>();
+
+    if (broadcast == 1) {
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
+
+      for (int i = 0; i < kNnlibMaxDim; i++) {
+        out_shape[i] = 1;
+        inp1_shape[i] = 1;
+        inp2_shape[i] = 1;
      }

-    return out;
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
+
+      for (int i = 0; i < out.dim(); i++)
+        out_shape[i + off_o] = out.size(i);
+      for (int i = 0; i < a.dim(); i++)
+        inp1_shape[i + off_a] = a.size(i);
+      for (int i = 0; i < b.dim(); i++)
+        inp2_shape[i + off_b] = b.size(i);
+
+      xa_nn_elm_add_broadcast_4D_f32xf32_f32(
+          out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
+    } else {
+      xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel());
+    }
+
+    return out;
   }
-
+
   ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
     ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
       using CTYPE_IN = typename torch::executor::
@@ -191,7 +199,6 @@ Tensor& add_out(
   return out;
 }

-
 } // namespace native
 } // namespace HiFi
 } // namespace impl
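Note: the optimized path above left-pads each tensor's sizes into a 4-entry shape array before calling into NNLIB. A standalone sketch of just that step, mirroring the off_o/off_a/off_b loops in add_out (pad_to_4d is a hypothetical helper, not part of this commit):

#include <array>
#include <cstdio>

constexpr int kNnlibMaxDim = 4;

// Left-pad a tensor's sizes to kNnlibMaxDim entries; leading dims become 1,
// which broadcasts cleanly in the 4-D NNLIB kernels.
std::array<int, kNnlibMaxDim> pad_to_4d(const int* sizes, int dim) {
  std::array<int, kNnlibMaxDim> shape;
  shape.fill(1);
  const int off = kNnlibMaxDim - dim;  // same offset computation as off_o/off_a/off_b
  for (int i = 0; i < dim; i++)
    shape[i + off] = sizes[i];
  return shape;
}

int main() {
  const int sizes[2] = {2, 3};  // a (2, 3) tensor
  const auto shape = pad_to_4d(sizes, 2);
  std::printf("%d %d %d %d\n", shape[0], shape[1], shape[2], shape[3]);  // prints: 1 1 2 3
  return 0;
}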
