diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index 70b2dd02076..5c25d89946e 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -25,7 +25,7 @@
 - op: add.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::add_out
+      kernel_name: torch::executor::add_out
 
 - op: bmm.out
   kernels:
@@ -45,12 +45,12 @@
 - op: div.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::div_out
+      kernel_name: torch::executor::div_out
 
 - op: div.out_mode
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::div_out_mode
+      kernel_name: torch::executor::div_out_mode
 
 - op: embedding.out
   kernels:
@@ -65,7 +65,7 @@
 - op: mul.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::mul_out
+      kernel_name: torch::executor::mul_out
 
 - op: permute_copy.out
   kernels:
@@ -75,7 +75,7 @@
 - op: sigmoid.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::sigmoid_out
+      kernel_name: torch::executor::sigmoid_out
 
 - op: slice_copy.Tensor_out
   kernels:
@@ -90,12 +90,7 @@
 - op: sub.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::sub_out
-
-- op: tanh.out
-  kernels:
-    - arg_meta: null
-      kernel_name: cadence::impl::HiFi::tanh_out
+      kernel_name: torch::executor::sub_out
 
 - op: view_copy.out
   kernels:
diff --git a/backends/cadence/cadence.cmake b/backends/cadence/cadence.cmake
index 0fa55c6a65b..25f241f205c 100644
--- a/backends/cadence/cadence.cmake
+++ b/backends/cadence/cadence.cmake
@@ -43,9 +43,6 @@ set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++)
 set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
 set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
 
-#workaround for larger compilation time
-set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing")
-
 set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET})
 set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld)
 add_link_options(-lm -stdlib=libc++ -Wl,--no-as-needed -static)
diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt
index 8fee7e85361..d03bb1c01ef 100644
--- a/backends/cadence/hifi/kernels/CMakeLists.txt
+++ b/backends/cadence/hifi/kernels/CMakeLists.txt
@@ -9,10 +9,6 @@ add_library(
   cadence_kernels
   kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
 )
 # Let files say "include ".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index a206635a285..d27e8051f52 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -11,49 +11,6 @@
 #include
 #include
 #include
-/* For NNLIB APIs */
-#include "xa_nnlib_kernels_api.h"
-
-/* Potential NNLIB function/APIs */
-extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
-
-extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
-
-extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const FLOAT32* __restrict__ p_inp1,
-    const FLOAT32* __restrict__ p_inp2,
-    WORD32 num_elm,
-    WORD32 mode);
-
-extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape,
-    WORD32 mode);
-
-extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
 
 namespace cadence {
 namespace impl {
diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
index cbbb279e5d6..78413ef312e 100644
--- a/backends/cadence/hifi/operators/CMakeLists.txt
+++ b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -20,12 +20,6 @@ endif()
 
 # ATen compliant ops that are needed to run this model.
 set(_aten_ops__srcs
-  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
-  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
-  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
-  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
-  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
-  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
@@ -35,29 +29,24 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
-  )
+)
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)
 target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)
diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp
deleted file mode 100644
index 10e06938f2e..00000000000
--- a/backends/cadence/hifi/operators/op_add.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include -#include -#include -#include -#include - -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using executorch::runtime::can_cast; -using executorch::runtime::CppTypeToScalarType; -using executorch::runtime::KernelRuntimeContext; -using torch::executor::Error; - -namespace impl { -namespace HiFi { -namespace native { - -namespace { -template < - bool can_cast, - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct AddInner; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct AddInner { - static void - run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) { - torch::executor::apply_binary_elementwise_fn( - // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) - [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted + alpha_val * b_casted; - - return static_cast(value); - }, - a, - b, - out); - } -}; - -template -struct ReportCanCastBug { - static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) { - ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); - } -}; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct AddInner - : public ReportCanCastBug {}; - -} // namespace - -Tensor& add_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out) { - ET_KERNEL_CHECK( - ctx, - torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ET_KERNEL_CHECK( - ctx, - executorch::runtime::tensor_is_realhbbf16_type(out), - InvalidArgument, - out); - ET_KERNEL_CHECK( - ctx, - executorch::runtime::tensors_have_same_dim_order(a, b, out), - InvalidArgument, - out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType alpha_type = - torch::executor::native::utils::get_scalar_dtype(alpha); - ScalarType common_type = - executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK( - ctx, - executorch::runtime::canCast(common_type, out_type), - InvalidArgument, - out); - ET_KERNEL_CHECK( - ctx, - torch::executor::check_alpha_type(alpha_type, common_type), - InvalidArgument, - out); - - float alpha_val; - torch::executor::native::utils::extract_scalar(alpha, &alpha_val); - - constexpr auto name = "add.out"; - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ - - int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); - bool optimized = 1; - /*find broadcast*/ - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - - if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) - optimized = 0; - - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - - if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - if (optimized) { - const float* const a_data = a.const_data_ptr(); - const float* const b_data = b.const_data_ptr(); - float* const out_data = out.mutable_data_ptr(); - - if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - - for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; - } - - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - - for (int i = 0; i < out.dim(); i++) - out_shape[i + off_o] = out.size(i); - for (int i = 0; i < a.dim(); i++) - inp1_shape[i + off_a] = a.size(i); - for (int i = 0; i < b.dim(); i++) - inp2_shape[i + off_b] = b.size(i); - - xa_nn_elm_add_broadcast_4D_f32xf32_f32( - out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } else { - xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel()); - } - - return out; - } - - ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - CTYPE_IN alpha_val; - torch::executor::native::utils::extract_scalar(alpha, &alpha_val); - - ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - AddInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, alpha_val, out); - }); - }); - }); - - return out; -} - -} // namespace native -} // namespace HiFi -} // namespace impl diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp deleted file mode 100644 index 88e670b432f..00000000000 --- a/backends/cadence/hifi/operators/op_div.cpp +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using executorch::aten::RuntimeContext; -using torch::executor::Error; - -namespace cadence { -namespace impl { -namespace HiFi { -namespace native { - -namespace { - -ScalarType get_compute_type(ScalarType a_type, ScalarType b_type) { - if (executorch::runtime::isFloatingType(a_type) && - executorch::runtime::isFloatingType(b_type)) { - return executorch::runtime::promoteTypes(a_type, b_type); - } else if (executorch::runtime::isFloatingType(a_type)) { - return a_type; - } else if (executorch::runtime::isFloatingType(b_type)) { - return b_type; - } - return ScalarType::Float; -} - -} // namespace - -Tensor& -div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - ET_KERNEL_CHECK( - ctx, - torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - - ET_KERNEL_CHECK( - ctx, - !executorch::runtime::isComplexType(a_type) && - !executorch::runtime::isQIntType(a_type) && - !executorch::runtime::isBitsType(a_type), - InvalidArgument, - out); - ET_KERNEL_CHECK( - ctx, - !executorch::runtime::isComplexType(b_type) && - !executorch::runtime::isQIntType(b_type) && - !executorch::runtime::isBitsType(b_type), - InvalidArgument, - out); - - ET_KERNEL_CHECK( - ctx, executorch::runtime::tensor_is_real_type(out), InvalidArgument, out); - - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ - int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); - bool optimized = 1; - /*find broadcast*/ - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - - if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - - if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - - if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - - for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; - } - - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - for (int i = 0; i < out.dim(); i++) - out_shape[i + off_o] = out.size(i); - for (int i = 0; i < a.dim(); i++) - inp1_shape[i + off_a] = a.size(i); - for (int i = 0; i < b.dim(); i++) - inp2_shape[i + off_b] = b.size(i); - - xa_nn_elm_div_broadcast_4D_f32xf32_f32( - out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } else { - xa_nn_elm_div_f32xf32_f32(out_data, a_data, b_data, out.numel()); - } - - return out; - } - - ScalarType common_type = get_compute_type(a_type, b_type); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK( - ctx, - executorch::runtime::canCast(common_type, out_type), - InvalidArgument, - out); - - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() { - ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() { - ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() { - torch::executor:: - apply_binary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted / b_casted; - - return static_cast(value); - }, - a, - b, - out); - }); - }); - }); - }); - - return out; -} - -Tensor& div_out_mode( - RuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - exec_aten::optional mode, - Tensor& out) { - ET_KERNEL_CHECK( - ctx, - torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = get_compute_type(a_type, b_type); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK( - ctx, executorch::runtime::tensor_is_real_type(out), InvalidArgument, out); - - // Allow casting float -> integral here - // non-bool -> bool is still disallowed - ET_KERNEL_CHECK( - ctx, - !(common_type != ScalarType::Bool && out_type == ScalarType::Bool), - InvalidArgument, - out); - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ - int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); - bool optimized = 1; - /*find broadcast*/ - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - - if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - - if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - int mode_val = -1; - if (mode.has_value() && mode.value() == "trunc") - mode_val = 0; - else if (mode.has_value() && mode.value() == "floor") - mode_val = 1; - else - optimized = 0; - - if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - - if (broadcast) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - - for (int i = 0; i < kNnlibMaxDim; i++) { - inp1_shape[i] = 1; - inp2_shape[i] = 1; - out_shape[i] = 1; - } - - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - - for (int i = 0; i < out.dim(); i++) - out_shape[i + off_o] = out.size(i); - for (int i = 0; i < a.dim(); i++) - inp1_shape[i + off_a] = a.size(i); - for (int i = 0; i < b.dim(); i++) - inp2_shape[i + off_b] = b.size(i); - - xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( - out_data, - out_shape, - a_data, - inp1_shape, - b_data, - inp2_shape, - mode_val); - } else { - xa_nn_elm_div_mode_f32xf32_f32( - out_data, a_data, b_data, out.numel(), mode_val); - } - - return out; - } - - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() { - ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() { - torch::executor:: - apply_binary_elementwise_fn( - [mode](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted / b_casted; - if (mode.has_value() && mode.value() == "trunc") { - value = std::trunc(value); - } else if (mode.has_value() && mode.value() == "floor") { - value = std::floor(value); - } - return static_cast(value); - }, - a, - b, - out); - }); - }); - }); - }); - - return out; -} - -} // namespace native -} // namespace HiFi -} // namespace impl -} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp deleted file mode 100644 index ad12606bdf6..00000000000 --- a/backends/cadence/hifi/operators/op_mul.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include - -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using executorch::aten::RuntimeContext; -using executorch::runtime::can_cast; -using executorch::runtime::CppTypeToScalarType; -using torch::executor::Error; - -namespace cadence { -namespace impl { -namespace HiFi { -namespace native { - -namespace { -template < - bool can_cast, - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct MulInner; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct MulInner { - static void run(const Tensor& a, const Tensor& b, Tensor& out) { - torch::executor::apply_binary_elementwise_fn( - // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted * b_casted; - - return static_cast(value); - }, - a, - b, - out); - } -}; - -struct ReportCanCastBug { - static void run(const Tensor&, const Tensor&, Tensor&) { - ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); - } -}; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct MulInner - : public ReportCanCastBug {}; -} // namespace - -Tensor& -mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - ET_KERNEL_CHECK( - ctx, - torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ET_KERNEL_CHECK( - ctx, - executorch::runtime::tensor_is_realhb_type(out), - InvalidArgument, - out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = - executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ - - int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); - bool optimized = 1; - /*find broadcast*/ - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - - if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; - - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - - if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - if (optimized) { - float* a_data = a.mutable_data_ptr(); - float* b_data = b.mutable_data_ptr(); - float* out_data = out.mutable_data_ptr(); - - if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; - } - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - for (int i = 0; i < out.dim(); i++) - out_shape[i + off_o] = out.size(i); - for (int i = 0; i < a.dim(); i++) - inp1_shape[i + off_a] = a.size(i); - for (int i = 0; i < b.dim(); i++) - inp2_shape[i + off_b] = b.size(i); - - xa_nn_elm_mul_broadcast_4D_f32xf32_f32( - out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } else { - xa_nn_elm_mul_f32xf32_f32(out_data, a_data, b_data, out.numel()); - } - - return out; - } - - ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { - MulInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, out); - }); - }); - }); - - return out; -} - -} // namespace native -} // namespace HiFi -} // namespace impl -} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp deleted file mode 100644 index b9fa73b879f..00000000000 --- a/backends/cadence/hifi/operators/op_sigmoid.cpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include - -using exec_aten::ScalarType; -using exec_aten::Tensor; -using executorch::aten::RuntimeContext; -using torch::executor::Error; - -namespace cadence { -namespace impl { -namespace HiFi { -namespace native { - -using Tensor = exec_aten::Tensor; - -Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - (void)ctx; - - ET_KERNEL_CHECK( - ctx, in.scalar_type() != ScalarType::Bool, InvalidArgument, out); - ET_KERNEL_CHECK( - ctx, - executorch::runtime::tensor_is_floating_type(out), - InvalidArgument, - out); - - // Resize for dynamic shape - ET_KERNEL_CHECK_MSG( - ctx, - resize_tensor(out, in.sizes()) == Error::Ok, - InvalidArgument, - out, - "Failed to resize output tensor."); - - ScalarType in_type = in.scalar_type(); - ScalarType out_type = out.scalar_type(); - - bool optimized = 1; - if ((in_type != ScalarType::Float) || (out_type != ScalarType::Float)) - optimized = 0; - - if (optimized) { - float* data_in = in.mutable_data_ptr(); - float* data_out = out.mutable_data_ptr(); - xa_nn_vec_sigmoid_f32_f32(data_out, data_in, in.numel()); - - return out; - } - - ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() { - ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() { - torch::executor::apply_unary_map_fn( - [](const CTYPE_IN val_in) { - // perform math in double to preserve precision - double in_casted = static_cast(val_in); - double out_val = 1.0 / (1.0 + exp(-in_casted)); - return static_cast(out_val); - }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); - }); - }); - - return out; -} - -} // namespace native -} // namespace HiFi -} // namespace impl -} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp deleted file mode 100644 index 0a362dbf959..00000000000 --- a/backends/cadence/hifi/operators/op_sub.cpp +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; -using executorch::aten::RuntimeContext; -using executorch::runtime::can_cast; -using executorch::runtime::CppTypeToScalarType; -using torch::executor::Error; - -namespace cadence { -namespace impl { -namespace HiFi { -namespace native { - -namespace { -template < - bool can_cast, - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct SubInner; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct SubInner { - static void - run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) { - torch::executor::apply_binary_elementwise_fn( - // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) - [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted - alpha_val * b_casted; - - return static_cast(value); - }, - a, - b, - out); - } -}; - -template -struct ReportCanCastBug { - static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) { - ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); - } -}; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct SubInner - : public ReportCanCastBug {}; - -} // namespace - -Tensor& sub_out( - RuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out) { - ET_KERNEL_CHECK( - ctx, - torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ET_KERNEL_CHECK( - ctx, - executorch::runtime::tensor_is_realh_type(out), - InvalidArgument, - out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType alpha_type = - torch::executor::native::utils::get_scalar_dtype(alpha); - ScalarType common_type = - executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK( - ctx, - executorch::runtime::canCast(common_type, out_type), - InvalidArgument, - out); - ET_KERNEL_CHECK( - ctx, - torch::executor::check_alpha_type(alpha_type, common_type), - InvalidArgument, - out); - - float alpha_val; - torch::executor::native::utils::extract_scalar(alpha, &alpha_val); - - constexpr auto name = "sub.out"; - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ - - int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); - bool optimized = 1; - /*find broadcast*/ - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); - int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); - max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - - if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) - optimized = 0; - - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; - - if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) - optimized = 0; - - if (optimized) { - /*logic to find broadcast*/ - const int a_is_broadcasted = !out.sizes().equals(a.sizes()); - const int b_is_broadcasted = !out.sizes().equals(b.sizes()); - const int broadcast = (a_is_broadcasted || b_is_broadcasted); - - const float* const a_data = a.const_data_ptr(); - const float* const b_data = b.const_data_ptr(); - float* const out_data = out.mutable_data_ptr(); - if (broadcast == 1) { - int out_shape[kNnlibMaxDim]; - int inp1_shape[kNnlibMaxDim]; - int inp2_shape[kNnlibMaxDim]; - - for (int i = 0; i < kNnlibMaxDim; i++) { - out_shape[i] = 1; - inp1_shape[i] = 1; - inp2_shape[i] = 1; - } - - int off_o = kNnlibMaxDim - out_dim; - int off_a = kNnlibMaxDim - a_dim; - int off_b = kNnlibMaxDim - b_dim; - for (int i = 0; i < out_dim; i++) - out_shape[i + off_o] = out.size(i); - for (int i = 0; i < a_dim; i++) - inp1_shape[i + off_a] = a.size(i); - for (int i = 0; i < b_dim; i++) - inp2_shape[i + off_b] = b.size(i); - - xa_nn_elm_sub_broadcast_4D_f32xf32_f32( - out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); - } else { - xa_nn_elm_sub_f32xf32_f32(out_data, a_data, b_data, out.numel()); - } - - return out; - } - - ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - CTYPE_IN alpha_val; - torch::executor::native::utils::extract_scalar(alpha, &alpha_val); - ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - SubInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, alpha_val, out); - }); - }); - }); - - return out; -} - -} // namespace native -} // namespace HiFi -} // namespace impl -} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_tanh.cpp b/backends/cadence/hifi/operators/op_tanh.cpp deleted file mode 100644 index 13578beb887..00000000000 --- a/backends/cadence/hifi/operators/op_tanh.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include - -using exec_aten::ScalarType; -using exec_aten::Tensor; -using executorch::aten::RuntimeContext; -using torch::executor::Error; - -namespace cadence { -namespace impl { -namespace HiFi { -namespace native { - -Tensor& tanh_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { - bool optimized = 1; - if ((in.scalar_type() != ScalarType::Float) || - (out.scalar_type() != ScalarType::Float)) - optimized = 0; - - if (optimized) { - float* data_in = in.mutable_data_ptr(); - float* data_out = out.mutable_data_ptr(); - xa_nn_vec_tanh_f32_f32(data_out, data_in, (int)in.numel()); - return out; - } - - return torch::executor::native::internal:: - unary_ufunc_realhbbf16_to_floathbf16(std::tanh, ctx, in, out); -} - -} // namespace native -} // namespace HiFi -} // namespace impl -} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c deleted file mode 100644 index 9eab22b05b7..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c +++ /dev/null @@ -1,428 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -******************************************************************************/ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_kernels_api.h" - - -#if HAVE_VFPU -static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 + inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_ADD_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_ADD_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_ADD_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For computing inp1 + inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_ADD_SX2(x1, x2); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_ADD_SX2(x1, x2); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_ADD_S(a0, b0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_add_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 + inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && 
((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_add_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_add_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_add_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = 
p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_add_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; - -} - diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c deleted file mode 100644 index 03b8d625186..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c +++ /dev/null @@ -1,419 +0,0 @@ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -//#include "xa_nn_basic_state.h" -#include "xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_div_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 - inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For computing inp1 - inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x1, x2); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x1, x2); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, 
(xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(a0, b0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_div_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 - inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_div_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_div_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_div_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - 
sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_div_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c deleted file mode 100644 index 95b449f43f7..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c +++ /dev/null @@ -1,644 +0,0 @@ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -//#include "xa_nn_basic_state.h" -#include "xa_nnlib_kernels_api.h" - -#if !HAVE_VFPU - DISCARD_FUN_FOR_NONVOID_RETURN( - WORD32, xa_nn_elm_div_mode_f32xf32_f32, - ( - FLOAT32 *p_out, - const FLOAT32 *p_inp1, - const FLOAT32 *p_inp2, - WORD32 num_elm, - WORD32 mode - ) - ) -#else -WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - WORD32 mode) -{ - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); - /* Basic Parameter checks */ - XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); - XA_NNLIB_ARG_CHK_COND(((mode != 0) && (mode != 1)), -1); - - int i; - xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; - xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; - xtfloatx2 *out = (xtfloatx2 *)p_out; - xtfloatx2 x1, x2, y; - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - out_a = AE_ZALIGN64(); - /* Each iteration of loop is independent so safe to use concurrent pragma */ - if(mode == 0) - { -#pragma concurrent /* Each iteration of loop is independent so safe to use concurrent pragma */ - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); - XT_SASX2IP(y, out_a, out); - } - } - else - { -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); - XT_SASX2IP(y, out_a, out); 
- } - } - XT_SASX2POSFP(out_a, out); - - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - a = XT_DIV_S(a1, a2); - if(mode == 0) - a = FITRUNC_S(a); - else - a = FIFLOOR_S(a); - XT_SSI(a, (xtfloat *)out, 0); - } - - return 0; -} -#endif - -#if HAVE_VFPU -static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag, - WORD32 mode) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 - inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - if(mode == 0) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - if(mode == 0) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); - XT_SASX2IP(y, out_a, p_c); - } - } - else - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); - XT_SASX2IP(y, out_a, p_c); - } - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(b0, a0); - if(mode == 0) - c0 = FITRUNC_S(c0); - else - c0 = FIFLOOR_S(c0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For computing inp1 - inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - if(mode == 0) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - }/* if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))*/ - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = 
XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - if(mode == 0) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); - XT_SASX2IP(y, out_a, p_c); - } - } - else - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); - XT_SASX2IP(y, out_a, p_c); - } - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(a0, b0); - if(mode == 0) - c0 = FITRUNC_S(c0); - else - c0 = FIFLOOR_S(c0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag, - WORD32 mode) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 - inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - if(mode == 0) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - XA_NNLIB_ARG_CHK_COND(((mode != 0) && (mode != 1)), -1); - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_div_mode_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag, - mode); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - 
for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_div_mode_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag, - mode); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_div_mode_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag, - mode); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_div_mode_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag, - mode); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c deleted file mode 100644 index b9aa102a15f..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c +++ /dev/null @@ -1,360 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -******************************************************************************/ -#include "xa_type_def.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" -#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" -#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_MUL_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_MUL_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_MUL_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } -} - -static void internal_elm_mul_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_mul_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_mul_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_mul_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = 
p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_mul_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif
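
For readers skimming the deletions above: the two removed files provided the HiFi VFPU implementations of broadcast element-wise multiply and divide-with-rounding-mode. As a rough aid, here is a minimal scalar sketch of the semantics those kernels implemented, based only on the deleted code shown in this diff: 4D broadcasting where a size-1 dimension collapses to stride 0, and a `mode` flag that truncates (mode 0) or floors (mode 1) the quotient, mirroring the FITRUNC/FIFLOOR paths. The function and helper names below are illustrative only; they are not part of nnlib or of this patch, and this is not the vendor implementation.

```c
/* Illustrative scalar reference only; hypothetical names, not nnlib API. */
#include <math.h>
#include <stddef.h>

/* mode 0: truncate the quotient toward zero; mode 1: floor it
   (mirrors the FITRUNC_SX2 / FIFLOOR_SX2 paths in the deleted kernels). */
static float div_with_mode(float a, float b, int mode)
{
  float q = a / b;
  return (mode == 0) ? truncf(q) : floorf(q);
}

/* 4D broadcast: a dimension of size 1 repeats its single element, i.e. its
   stride collapses to 0, as the deleted kernels arranged via their
   inp1_strides / inp2_strides tables. Output shape is assumed to already be
   the per-dimension maximum of the two input shapes. */
static void ref_elm_div_mode_broadcast_4D(
    float *out, const int out_shape[4],
    const float *in1, const int in1_shape[4],
    const float *in2, const int in2_shape[4],
    int mode)
{
  int s1[4], s2[4];
  int stride1 = 1, stride2 = 1;
  for (int i = 3; i >= 0; i--) {
    s1[i] = (in1_shape[i] == 1) ? 0 : stride1;  /* broadcast -> stride 0 */
    s2[i] = (in2_shape[i] == 1) ? 0 : stride2;
    stride1 *= in1_shape[i];
    stride2 *= in2_shape[i];
  }
  size_t o = 0;
  for (int i0 = 0; i0 < out_shape[0]; i0++)
    for (int i1 = 0; i1 < out_shape[1]; i1++)
      for (int i2 = 0; i2 < out_shape[2]; i2++)
        for (int i3 = 0; i3 < out_shape[3]; i3++, o++) {
          size_t a = (size_t)(i0 * s1[0] + i1 * s1[1] + i2 * s1[2] + i3 * s1[3]);
          size_t b = (size_t)(i0 * s2[0] + i1 * s2[1] + i2 * s2[2] + i3 * s2[3]);
          out[o] = div_with_mode(in1[a], in2[b], mode);
        }
}
```

The removed broadcast multiply kernel follows the same stride/broadcast pattern with `a * b` in place of `div_with_mode`; the deleted SIMD code additionally specializes the aligned, unaligned, and operand-swapped (`sign_flag`) cases that this scalar sketch folds into one loop.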