diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 6e5ab73645..6bd208a326 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -44,7 +44,7 @@ jobs: run: conda install conda-build - name: Build conda package run: | - CHANNELS="-c intel -c main --override-channels" + CHANNELS="-c dppy/label/tools -c intel -c main --override-channels" VERSIONS="--python ${{ matrix.python }}" TEST="--no-test" conda build \ @@ -161,13 +161,20 @@ jobs: run: | . $CONDA/etc/profile.d/conda.sh conda activate test_dpctl - python -c "import dpctl; dpctl.lsplatform()" + python -c "import dpctl; dpctl.lsplatform(verbosity=2)" + - name: Install gdb + run: | + sudo apt-get install -y gdb + - name: Run test_elementwise under gdb + run: | + . $CONDA/etc/profile.d/conda.sh + conda activate test_dpctl + gdb --batch -ex r -ex 'info sharedlibrary' -ex 'set print elements 1000' -ex bt --args ${CONDA_PREFIX}/bin/python -m pytest -q -ra --disable-warnings --pyargs dpctl.tests.test_tensor_elementwise::test_cos_order -vv || true - name: Run tests run: | . $CONDA/etc/profile.d/conda.sh conda activate test_dpctl - # clinfo -l - python -m pytest --pyargs $MODULE_NAME + python -m pytest -v --pyargs $MODULE_NAME test_windows: needs: build_windows diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index a2bc599e38..0bcfe56c18 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -14,7 +14,7 @@ requirements: build: - {{ compiler('cxx') }} - {{ compiler('dpcpp') }} >=2023.1 # [not osx] - - sysroot_linux-64 >=2.17 # [linux] + - sysroot_linux-64 >=2.28 # [linux] host: - setuptools - cmake >=3.21 diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 66193be58c..4b3566d7bd 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -3,5 +3,5 @@ set -e ${PYTHON} -c "import dpctl; print(dpctl.__version__)" -${PYTHON} -c "import dpctl; dpctl.lsplatform()" +${PYTHON} -c "import dpctl; dpctl.lsplatform(verbosity=2)" ${PYTHON} -m pytest -q -ra --disable-warnings -p no:faulthandler --cov dpctl --cov-report term-missing --pyargs dpctl -vv diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 083f2558c5..9e22280f37 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -46,7 +46,15 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ) +set(_clang_prefix "") +if (WIN32) + set(_clang_prefix "/clang:") +endif() +set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp + PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math") target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) if(UNIX) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 7364397648..be9426a834 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -91,6 +91,17 @@ from dpctl.tensor._utility_functions import all, any from ._constants import e, inf, nan, newaxis, pi +from ._elementwise_funcs import ( + abs, + add, + cos, + divide, + equal, + isfinite, + isinf, + isnan, + sqrt, +) __all__ = [ "Device", @@ -167,4 +178,13 @@ "pi", "nan", "inf", + 
"abs", + "add", + "cos", + "isinf", + "isnan", + "isfinite", + "sqrt", + "divide", + "equal", ] diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py new file mode 100644 index 0000000000..6677670e73 --- /dev/null +++ b/dpctl/tensor/_elementwise_common.py @@ -0,0 +1,607 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers + +import numpy as np + +import dpctl +import dpctl.memory as dpm +import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti +from dpctl.tensor._manipulation_functions import _broadcast_shape_impl +from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer +from dpctl.utils import ExecutionPlacementError + +from ._type_utils import ( + _empty_like_orderK, + _empty_like_pair_orderK, + _find_buf_dtype, + _find_buf_dtype2, + _to_device_supported_dtype, +) + + +class UnaryElementwiseFunc: + """ + Class that implements unary element-wise functions. + """ + + def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): + self.__name__ = "UnaryElementwiseFunc" + self.name_ = name + self.result_type_resolver_fn_ = result_type_resolver_fn + self.unary_fn_ = unary_dp_impl_fn + self.__doc__ = docs + + def __call__(self, x, out=None, order="K"): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if out.shape != x.shape: + raise TypeError( + "The shape of input and output arrays are inconsistent." 
+ f"Expected output shape is {x.shape}, got {out.shape}" + ) + + if ti._array_overlap(x, out): + raise TypeError("Input and output arrays have memory overlap") + + if ( + dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue)) + is None + ): + raise TypeError( + "Input and output allocation queues are not compatible" + ) + + if order not in ["C", "F", "K", "A"]: + order = "K" + buf_dt, res_dt = _find_buf_dtype( + x.dtype, self.result_type_resolver_fn_, x.sycl_device + ) + if res_dt is None: + raise RuntimeError + exec_q = x.sycl_queue + if buf_dt is None: + if out is None: + if order == "K": + out = _empty_like_orderK(x, res_dt) + else: + if order == "A": + order = "F" if x.flags.f_contiguous else "C" + out = dpt.empty_like(x, dtype=res_dt, order=order) + else: + if res_dt != out.dtype: + raise TypeError( + f"Output array of type {res_dt} is needed," + f" got {out.dtype}" + ) + + ht, _ = self.unary_fn_(x, out, sycl_queue=exec_q) + ht.wait() + + return out + if order == "K": + buf = _empty_like_orderK(x, buf_dt) + else: + if order == "A": + order = "F" if x.flags.f_contiguous else "C" + buf = dpt.empty_like(x, dtype=buf_dt, order=order) + + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=buf, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_orderK(buf, res_dt) + else: + out = dpt.empty_like(buf, dtype=res_dt, order=order) + else: + if buf_dt != out.dtype: + raise TypeError( + f"Output array of type {buf_dt} is needed, got {out.dtype}" + ) + + ht, _ = self.unary_fn_(buf, out, sycl_queue=exec_q, depends=[copy_ev]) + ht_copy_ev.wait() + ht.wait() + + return out + + +def _get_queue_usm_type(o): + """Return SYCL device where object `o` allocated memory, or None.""" + if isinstance(o, dpt.usm_ndarray): + return o.sycl_queue, o.usm_type + elif hasattr(o, "__sycl_usm_array_interface__"): + try: + m = dpm.as_usm_memory(o) + return m.sycl_queue, m.get_usm_type() + except Exception: + return None, None + return None, None + + +class WeakBooleanType: + "Python type representing type of Python boolean objects" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakIntegralType: + "Python type representing type of Python integral objects" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakInexactType: + """Python type representing type of Python real- or + complex-valued floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +def _get_dtype(o, dev): + if isinstance(o, dpt.usm_ndarray): + return o.dtype + if hasattr(o, "__sycl_usm_array_interface__"): + return dpt.asarray(o).dtype + if _is_buffer(o): + host_dt = np.array(o).dtype + dev_dt = _to_device_supported_dtype(host_dt, dev) + return dev_dt + if hasattr(o, "dtype"): + dev_dt = _to_device_supported_dtype(o.dtype, dev) + return dev_dt + if isinstance(o, bool): + return WeakBooleanType(o) + if isinstance(o, int): + return WeakIntegralType(o) + if isinstance(o, (float, complex)): + return WeakInexactType(o) + return np.object_ + + +def _validate_dtype(dt) -> bool: + return isinstance( + dt, (WeakBooleanType, WeakInexactType, WeakIntegralType) + ) or ( + isinstance(dt, dpt.dtype) + and dt + in [ + dpt.bool, + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + dpt.float16, + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + ) + + +def _weak_type_num_kind(o): + _map = {"?": 0, "i": 1, "f": 2} 
+ if isinstance(o, WeakBooleanType): + return _map["?"] + if isinstance(o, WeakIntegralType): + return _map["i"] + if isinstance(o, WeakInexactType): + return _map["f"] + raise TypeError( + f"Unexpected type {o} while expecting " + "`WeakBooleanType`, `WeakIntegralType`, or " + "`WeakInexactType`." + ) + + +def _strong_dtype_num_kind(o): + _map = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 2} + if not isinstance(o, dpt.dtype): + raise TypeError + k = o.kind + if k in _map: + return _map[k] + raise ValueError(f"Unrecognized kind {k} for dtype {o}") + + +def _resolve_weak_types(o1_dtype, o2_dtype, dev): + "Resolves weak data type per NEP-0050" + if isinstance( + o1_dtype, (WeakBooleanType, WeakInexactType, WeakIntegralType) + ): + if isinstance( + o2_dtype, (WeakBooleanType, WeakInexactType, WeakIntegralType) + ): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakBooleanType): + return dpt.bool, o2_dtype + if isinstance(o1_dtype, WeakIntegralType): + return dpt.int64, o2_dtype + if isinstance(o1_dtype.get(), complex): + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif isinstance( + o2_dtype, (WeakBooleanType, WeakInexactType, WeakIntegralType) + ): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakBooleanType): + return o1_dtype, dpt.bool + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.int64 + if isinstance(o2_dtype.get(), complex): + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _get_shape(o): + if isinstance(o, dpt.usm_ndarray): + return o.shape + if _is_buffer(o): + return memoryview(o).shape + if isinstance(o, numbers.Number): + return tuple() + return getattr(o, "shape", tuple()) + + +class BinaryElementwiseFunc: + """ + Class that implements binary element-wise functions. + """ + + def __init__(self, name, result_type_resolver_fn, binary_dp_impl_fn, docs): + self.__name__ = "BinaryElementwiseFunc" + self.name_ = name + self.result_type_resolver_fn_ = result_type_resolver_fn + self.binary_fn_ = binary_dp_impl_fn + self.__doc__ = docs + + def __str__(self): + return f"" + + def __repr__(self): + return f"" + + def __call__(self, o1, o2, out=None, order="K"): + if order not in ["K", "C", "F", "A"]: + order = "K" + q1, o1_usm_type = _get_queue_usm_type(o1) + q2, o2_usm_type = _get_queue_usm_type(o2) + if q1 is None and q2 is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + "One of the arguments must represent USM allocation and " + "expose `__sycl_usm_array_interface__` property" + ) + if q1 is None: + exec_q = q2 + res_usm_type = o2_usm_type + elif q2 is None: + exec_q = q1 + res_usm_type = o1_usm_type + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + o1_usm_type, + o2_usm_type, + ) + ) + dpctl.utils.validate_usm_type(res_usm_type, allow_none=False) + o1_shape = _get_shape(o1) + o2_shape = _get_shape(o2) + if not all( + isinstance(s, (tuple, list)) + for s in ( + o1_shape, + o2_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. " + "Arguments are expected to be " + ) + try: + res_shape = _broadcast_shape_impl( + [ + o1_shape, + o2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{o1_shape} and {o2_shape}" + ) + sycl_dev = exec_q.sycl_device + o1_dtype = _get_dtype(o1, sycl_dev) + o2_dtype = _get_dtype(o2, sycl_dev) + if not all(_validate_dtype(o) for o in (o1_dtype, o2_dtype)): + raise ValueError("Operands of unsupported types") + + o1_dtype, o2_dtype = _resolve_weak_types(o1_dtype, o2_dtype, sycl_dev) + + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + o1_dtype, o2_dtype, self.result_type_resolver_fn_, sycl_dev + ) + + if res_dt is None: + raise TypeError( + "function 'add' does not support input types " + f"({o1_dtype}, {o2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if out.shape != res_shape: + raise TypeError( + "The shape of input and output arrays are inconsistent." + f"Expected output shape is {o1_shape}, got {out.shape}" + ) + + if ti._array_overlap(o1, out) or ti._array_overlap(o2, out): + raise TypeError("Input and output arrays have memory overlap") + + if ( + dpctl.utils.get_execution_queue( + (o1.sycl_queue, o2.sycl_queue, out.sycl_queue) + ) + is None + ): + raise TypeError( + "Input and output allocation queues are not compatible" + ) + + if isinstance(o1, dpt.usm_ndarray): + src1 = o1 + else: + src1 = dpt.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q) + if isinstance(o2, dpt.usm_ndarray): + src2 = o2 + else: + src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q) + + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + src1, src2, res_dt, res_usm_type, exec_q + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + src1, + src2, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + else: + if res_dt != out.dtype: + raise TypeError( + f"Output array of type {res_dt} is needed," + f"got {out.dtype}" + ) + + src1 = dpt.broadcast_to(src1, res_shape) + src2 = dpt.broadcast_to(src2, res_shape) + ht_, _ = self.binary_fn_( + src1=src1, src2=src2, dst=out, sycl_queue=exec_q + ) + ht_.wait() + return out + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(src2, buf2_dt) + else: + if order == "A": + order = "F" if src1.flags.f_contiguous else "C" + buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + src1, buf2, res_dt, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + else: + if res_dt != out.dtype: + raise TypeError( + f"Output array of type {res_dt} is 
needed," + f"got {out.dtype}" + ) + + src1 = dpt.broadcast_to(src1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, _ = self.binary_fn_( + src1=src1, + src2=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + ht_copy_ev.wait() + ht_.wait() + return out + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(src1, buf1_dt) + else: + if order == "A": + order = "F" if src1.flags.f_contiguous else "C" + buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src1, dst=buf1, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, src2, res_dt, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + else: + if res_dt != out.dtype: + raise TypeError( + f"Output array of type {res_dt} is needed," + f"got {out.dtype}" + ) + + buf1 = dpt.broadcast_to(buf1, res_shape) + src2 = dpt.broadcast_to(src2, res_shape) + ht_, _ = self.binary_fn_( + src1=buf1, + src2=src2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + ht_copy_ev.wait() + ht_.wait() + return out + + if order in ["K", "A"]: + if src1.flags.f_contiguous and src2.flags.f_contiguous: + order = "F" + elif src1.flags.c_contiguous and src2.flags.c_contiguous: + order = "C" + else: + order = "C" if order == "A" else "K" + if order == "K": + buf1 = _empty_like_orderK(src1, buf1_dt) + else: + buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src1, dst=buf1, sycl_queue=exec_q + ) + if order == "K": + buf2 = _empty_like_orderK(src2, buf2_dt) + else: + buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, buf2, res_dt, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + else: + if res_dt != out.dtype: + raise TypeError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + buf1 = dpt.broadcast_to(buf1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, _ = self.binary_fn_( + src1=buf1, + src2=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_]) + return out diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py new file mode 100644 index 0000000000..27c549c034 --- /dev/null +++ b/dpctl/tensor/_elementwise_funcs.py @@ -0,0 +1,288 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dpctl.tensor._tensor_impl as ti + +from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc + +# U01: ==== ABS (x) +_abs_docstring_ = """ +Calculate the absolute value element-wise. +""" + +abs = UnaryElementwiseFunc("abs", ti._abs_result_type, ti._abs, _abs_docstring_) + +# U02: ==== ACOS (x) +# FIXME: implement U02 + +# U03: ===== ACOSH (x) +# FIXME: implement U03 + +# B01: ===== ADD (x1, x2) + +_add_docstring_ = """ +add(x1, x2, order='K') + +Calculates the sum for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have numeric data type. +Returns: + usm_narray: + an array containing the element-wise sums. The data type of the + returned array is determined by the Type Promotion Rules. +""" +add = BinaryElementwiseFunc( + "add", ti._add_result_type, ti._add, _add_docstring_ +) + +# U04: ===== ASIN (x) +# FIXME: implement U04 + +# U05: ===== ASINH (x) +# FIXME: implement U05 + +# U06: ===== ATAN (x) +# FIXME: implement U06 + +# B02: ===== ATAN2 (x1, x2) +# FIXME: implemetn B02 + +# U07: ===== ATANH (x) +# FIXME: implemetn U07 + +# B03: ===== BITWISE_AND (x1, x2) +# FIXME: implemetn B03 + +# B04: ===== BITWISE_LEFT_SHIFT (x1, x2) +# FIXME: implement B04 + +# U08: ===== BITWISE_INVERT (x) +# FIXME: implement U08 + +# B05: ===== BITWISE_OR (x1, x2) +# FIXME: implement B05 + +# B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +# FIXME: implement B06 + +# B07: ===== BITWISE_XOR (x1, x2) +# FIXME: implement B07 + +# U09: ==== CEIL (x) +# FIXME: implement U09 + +# U10: ==== CONJ (x) +# FIXME: implement U10 + +# U11: ==== COS (x) +_cos_docstring = """ +cos(x, order='K') + +Computes cosine for each element `x_i` for input array `x`. +""" + +cos = UnaryElementwiseFunc("cos", ti._cos_result_type, ti._cos, _cos_docstring) + +# U12: ==== COSH (x) +# FIXME: implement U12 + +# B08: ==== DIVIDE (x1, x2) +_divide_docstring_ = """ +divide(x1, x2, order='K') + +Calculates the ratio for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have numeric data type. +Returns: + usm_narray: + an array containing the result of element-wise division. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +divide = BinaryElementwiseFunc( + "divide", ti._divide_result_type, ti._divide, _divide_docstring_ +) + +# B09: ==== EQUAL (x1, x2) +_equal_docstring_ = """ +equal(x1, x2, order='K') + +Calculates equality test results for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have numeric data type. +Returns: + usm_narray: + an array containing the result of element-wise equality comparison. + The data type of the returned array is determined by the + Type Promotion Rules. 
+""" + +equal = BinaryElementwiseFunc( + "equal", ti._equal_result_type, ti._equal, _equal_docstring_ +) + +# U13: ==== EXP (x) +# FIXME: implement U13 + +# U14: ==== EXPM1 (x) +# FIXME: implement U14 + +# U15: ==== FLOOR (x) +# FIXME: implement U15 + +# B10: ==== FLOOR_DIVIDE (x1, x2) +# FIXME: implement B10 + +# B11: ==== GREATER (x1, x2) +# FIXME: implement B11 + +# B12: ==== GREATER_EQUAL (x1, x2) +# FIXME: implement B12 + +# U16: ==== IMAG (x) +# FIXME: implement U16 + +# U17: ==== ISFINITE (x) +_isfinite_docstring_ = """ +Computes if every element of input array is a finite number. +""" + +isfinite = UnaryElementwiseFunc( + "isfinite", ti._isfinite_result_type, ti._isfinite, _isfinite_docstring_ +) + +# U18: ==== ISINF (x) +_isinf_docstring_ = """ +Computes if every element of input array is an infinity. +""" + +isinf = UnaryElementwiseFunc( + "isinf", ti._isinf_result_type, ti._isinf, _isinf_docstring_ +) + +# U19: ==== ISNAN (x) +_isnan_docstring_ = """ +Computes if every element of input array is a NaN. +""" + +isnan = UnaryElementwiseFunc( + "isnan", ti._isnan_result_type, ti._isnan, _isnan_docstring_ +) + +# B13: ==== LESS (x1, x2) +# FIXME: implement B13 + +# B14: ==== LESS_EQUAL (x1, x2) +# FIXME: implement B14 + +# U20: ==== LOG (x) +# FIXME: implement U20 + +# U21: ==== LOG1P (x) +# FIXME: implement U21 + +# U22: ==== LOG2 (x) +# FIXME: implement U22 + +# U23: ==== LOG10 (x) +# FIXME: implement U23 + +# B15: ==== LOGADDEXP (x1, x2) +# FIXME: implement B15 + +# B16: ==== LOGICAL_AND (x1, x2) +# FIXME: implement B16 + +# U24: ==== LOGICAL_NOT (x) +# FIXME: implement U24 + +# B17: ==== LOGICAL_OR (x1, x2) +# FIXME: implement B17 + +# B18: ==== LOGICAL_XOR (x1, x2) +# FIXME: implement B18 + +# B19: ==== MULTIPLY (x1, x2) +# FIXME: implement B19 + +# U25: ==== NEGATIVE (x) +# FIXME: implement U25 + +# B20: ==== NOT_EQUAL (x1, x2) +# FIXME: implement B20 + +# U26: ==== POSITIVE (x) +# FIXME: implement U26 + +# B21: ==== POW (x1, x2) +# FIXME: implement B21 + +# U27: ==== REAL (x) +# FIXME: implement U27 + +# B22: ==== REMAINDER (x1, x2) +# FIXME: implement B22 + +# U28: ==== ROUND (x) +# FIXME: implement U28 + +# U29: ==== SIGN (x) +# FIXME: implement U29 + +# U30: ==== SIN (x) +# FIXME: implement U30 + +# U31: ==== SINH (x) +# FIXME: implement U31 + +# U32: ==== SQUARE (x) +# FIXME: implement U32 + +# U33: ==== SQRT (x) +_sqrt_docstring_ = """ +Computes sqrt for each element `x_i` for input array `x`. +""" + +sqrt = UnaryElementwiseFunc( + "sqrt", ti._sqrt_result_type, ti._sqrt, _sqrt_docstring_ +) + +# B23: ==== SUBTRACT (x1, x2) +# FIXME: implement B23 + +# U34: ==== TAN (x) +# FIXME: implement U34 + +# U35: ==== TANH (x) +# FIXME: implement U35 + +# U36: ==== TRUNC (x) +# FIXME: implement U36 diff --git a/dpctl/tensor/_type_utils.py b/dpctl/tensor/_type_utils.py index 3ea6875fce..d33f2eba06 100644 --- a/dpctl/tensor/_type_utils.py +++ b/dpctl/tensor/_type_utils.py @@ -14,7 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import builtins + import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti def _all_data_types(_fp16, _fp64): @@ -111,3 +114,187 @@ def _can_cast(from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool): return True return can_cast_v + + +def _empty_like_orderK(X, dt, usm_type=None, dev=None): + """Returns empty array like `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X)}") + if usm_type is None: + usm_type = X.usm_type + if dev is None: + dev = X.device + fl = X.flags + if fl["C"] or X.size <= 1: + return dpt.empty_like( + X, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty_like( + X, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st = list(X.strides) + perm = sorted( + range(X.ndim), key=lambda d: builtins.abs(st[d]), reverse=True + ) + inv_perm = sorted(range(X.ndim), key=lambda i: perm[i]) + st_sorted = [st[i] for i in perm] + sh = X.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if min(st_sorted) < 0: + sl = tuple( + slice(None, None, -1) + if st_sorted[i] < 0 + else slice(None, None, None) + for i in range(X.ndim) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_pair_orderK(X1, X2, dt, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + nd1 = X1.ndim + nd2 = X2.ndim + if nd1 > nd2: + return _empty_like_orderK(X1, dt, usm_type, dev) + elif nd1 < nd2: + return _empty_like_orderK(X2, dt, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + if fl1["C"] or fl2["C"]: + return dpt.empty_like( + X1, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"]: + return dpt.empty_like( + X1, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + perm = sorted( + range(nd1), + key=lambda d: (builtins.abs(st1[d]), builtins.abs(st2[d])), + reverse=True, + ) + inv_perm = sorted(range(nd1), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + sh = X1.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted)) < 0: + sl = tuple( + slice(None, None, -1) + if (st1_sorted[i] < 0 and st2_sorted[i] < 0) + else slice(None, None, None) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _to_device_supported_dtype(dt, dev): + has_fp16 = dev.has_aspect_fp16 + has_fp64 = dev.has_aspect_fp64 + + if has_fp64: + if not has_fp16: + if dt is dpt.float16: + return dpt.float32 + else: + if dt is dpt.float64: + return dpt.float32 + elif dt is dpt.complex128: + return dpt.complex64 + if not has_fp16 and dt is dpt.float16: + return dpt.float32 + return dt + + +def _find_buf_dtype(arg_dtype, query_fn, sycl_dev): + res_dt = query_fn(arg_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf_dt in all_dts: + if _can_cast(arg_dtype, buf_dt, _fp16, _fp64): + res_dt = query_fn(buf_dt) + if res_dt: + 
return buf_dt, res_dt + + return None, None + + +def _get_device_default_dtype(dt_kind, sycl_dev): + if dt_kind == "b": + return dpt.dtype(ti.default_device_bool_type(sycl_dev)) + elif dt_kind == "i": + return dpt.dtype(ti.default_device_int_type(sycl_dev)) + elif dt_kind == "u": + return dpt.dtype(ti.default_device_int_type(sycl_dev).upper()) + elif dt_kind == "f": + return dpt.dtype(ti.default_device_fp_type(sycl_dev)) + elif dt_kind == "c": + return dpt.dtype(ti.default_device_complex_type(sycl_dev)) + raise RuntimeError + + +def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf1_dt in all_dts: + for buf2_dt in all_dts: + if _can_cast(arg1_dtype, buf1_dt, _fp16, _fp64) and _can_cast( + arg2_dtype, buf2_dt, _fp16, _fp64 + ): + res_dt = query_fn(buf1_dt, buf2_dt) + if res_dt: + ret_buf1_dt = None if buf1_dt == arg1_dtype else buf1_dt + ret_buf2_dt = None if buf2_dt == arg2_dtype else buf2_dt + if ret_buf1_dt is None or ret_buf2_dt is None: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + # both are being promoted, if the kind of result is + # different than the kind of original input dtypes, + # we must use default dtype for the resulting kind. + if (res_dt.kind != arg1_dtype.kind) and ( + res_dt.kind != arg2_dtype.kind + ): + default_dt = _get_device_default_dtype( + res_dt.kind, sycl_dev + ) + if res_dt == default_dt: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + continue + else: + return ret_buf1_dt, ret_buf2_dt, res_dt + + return None, None, None + + +__all__ = [ + "_find_buf_dtype", + "_find_buf_dtype2", + "_empty_like_orderK", + "_empty_like_pair_orderK", + "_to_device_supported_dtype", +] diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp index dec96fab2a..8418fca83c 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp @@ -1,5 +1,5 @@ -//=== boolean_reductions.hpp - Implementation of boolean reduction kernels -//---*-C++-*--/===// +//=== boolean_reductions.hpp - Implementation of boolean reduction kernels // +// ---*-C++-*--/===// // // Data Parallel Control (dpctl) // diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp index 6449d992cd..49111cbb61 100644 --- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -3,7 +3,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index 02b7ac3c2d..f1e63ccc60 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
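The Python layer introduced above (_elementwise_common.py, _elementwise_funcs.py and the helpers in _type_utils.py) is the user-facing side of this change. A minimal usage sketch, illustrative only and not part of the patch, assuming the patch is installed and a default SYCL device is available:

import dpctl.tensor as dpt

x = dpt.asarray([0.0, 1.0, 2.0], dtype=dpt.float32)
y = dpt.asarray([3.0, 4.0, 5.0], dtype=dpt.float32)

s = dpt.add(x, y)        # element-wise sum; result dtype follows the promotion rules
c = dpt.cos(x)           # unary function; result allocated with order="K" by default
r = dpt.divide(x, 2)     # Python int is a weak type (NEP-0050), so the result stays float32

out = dpt.empty_like(x)  # an existing array may be supplied as the destination
dpt.add(x, y, out=out)

Inputs whose dtypes are not directly supported by a kernel are first copied into temporary buffers chosen by _find_buf_dtype / _find_buf_dtype2 above before the implementation function is invoked.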
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp new file mode 100644 index 0000000000..09f2995874 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -0,0 +1,224 @@ +//=== abs.hpp - Unary function ABS ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ABS(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace abs +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template struct AbsFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) + { + + if constexpr (std::is_same_v || + (std::is_integral::value && + std::is_unsigned::value)) + { + static_assert(std::is_same_v); + return x; + } + else { + return std::abs(x); + } + } +}; + +template +using AbsContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template struct AbsOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class abs_contig_kernel; + +template +sycl::event abs_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event abs_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename 
AbsOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + AbsContigFunctor(arg_tp, res_tp, + nelems)); + }); + return abs_ev; +} + +template struct AbsContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = abs_contig_impl; + return fn; + } + } +}; + +template struct AbsTypeMapFactory +{ + /*! @brief get typeid for output type of std::abs(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AbsOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using AbsStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template class abs_strided_kernel; + +template +sycl::event abs_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename AbsOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, + AbsStridedFunctor(arg_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template struct AbsStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = abs_strided_impl; + return fn; + } + } +}; + +} // namespace abs +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp new file mode 100644 index 0000000000..d0ab25d270 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -0,0 +1,460 @@ +//=== add.hpp - Binary function ADD ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ADD(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace add +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct AddFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return in1 + in2; + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto tmp = in1 + in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using AddContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using AddStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template struct AddOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class add_contig_kernel; + +template +sycl::event add_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename AddOutputType::value_type; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + cgh.parallel_for< + add_contig_kernel>( + sycl::nd_range<1>(gws_range, lws_range), + AddContigFunctor( + arg1_tp, arg2_tp, res_tp, nelems)); + }); + return comp_ev; +} + +template struct AddContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_contig_impl; + return fn; + } + } +}; + +template struct AddTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::add(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename AddOutputType::value_type; + ; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class add_strided_strided_kernel; + +template +sycl::event add_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename AddOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for< + add_strided_strided_kernel>( + {nelems}, AddStridedFunctor( + arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template struct AddStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_strided_impl; + return fn; + } + } +}; + +template +class add_matrix_row_broadcast_sg_krn; + +template +using AddContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + AddFunctor>; + +template +sycl::event add_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] + vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; + const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + size_t n1_padded = n1 + max_sgSize; + argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + + if (padded_vec == nullptr) { + throw std::runtime_error("Could not allocate memory on the device"); + } + sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); // ensure vec contains actual data + cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { + auto i = id[0]; + padded_vec[i] = vec[i % n1]; + }); + }); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sg.load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sg.load(&padded_vec[(base / n0)]). 
The vector is padded to + // ensure that reads are accessible + + size_t lws = 64; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + size_t n_elems = n0 * n1; + size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + cgh.parallel_for< + class add_matrix_row_broadcast_sg_krn>( + sycl::nd_range<1>(gwsRange, lwsRange), + AddContigMatrixContigRowBroadcastingFunctor( + mat, padded_vec, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + sycl::context ctx = exec_q.get_context(); + cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +template +struct AddContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + using resT = typename AddOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + add_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event add_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] + vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return add_contig_matrix_contig_row_broadcast_impl( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +}; + +template +struct AddContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename AddOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + add_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace add +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp new file mode 100644 index 0000000000..a1108c541c --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -0,0 +1,621 @@ +//=== common.hpp - Common code for elementwise operations ----- *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines common code for elementwise tensor operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace elementwise_common +{ + +/*! @brief Functor for unary function evaluation on contiguous array */ +template +struct UnaryContigFunctor +{ +private: + const argT *in = nullptr; + resT *out = nullptr; + const size_t nelems_; + +public: + UnaryContigFunctor(const argT *inp, resT *res, const size_t n_elems) + : in(inp), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + UnaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: vec_sz must divide sg.max_local_range()[0] */ + + if constexpr (UnaryOperatorT::is_constant::value) { + // value of operator is known to be a known constant + constexpr resT const_val = UnaryOperatorT::constant_value; + using out_ptrT = + sycl::multi_ptr; + + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t max_sgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + n_vecs * vec_sz * sgSize < nelems_ && + max_sgSize == sgSize) { + sycl::vec res_vec(const_val); +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + sg.store(out_ptrT(&out[base + it * sgSize]), + res_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + out[k] = const_val; + } + } + } + else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && + UnaryOperatorT::supports_vec::value) + { + using in_ptrT = + sycl::multi_ptr; + using out_ptrT = + sycl::multi_ptr; + + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_local_range()[0]; + std::uint16_t max_sgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * max_sgSize); + if (base + n_vecs * vec_sz * sgSize < nelems_ && + sgSize == max_sgSize) { + sycl::vec x; + +#pragma unroll + for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + x = sg.load(in_ptrT(&in[base + it * sgSize])); + sycl::vec res_vec = op(x); + sg.store(out_ptrT(&out[base + it * sgSize]), + res_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + // scalar call + out[k] = op(in[k]); + } + } + } + else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && + std::is_same_v) + { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * maxsgSize); + + if ((base + n_vecs * vec_sz * sgSize < nelems_) && + (maxsgSize == sgSize)) { + using in_ptrT = + sycl::multi_ptr; + using out_ptrT = + sycl::multi_ptr; + sycl::vec arg_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); +#pragma unroll + for (std::uint8_t k = 
0; k < vec_sz; ++k) { + arg_vec[k] = op(arg_vec[k]); + } + sg.store(out_ptrT(&out[base + it * sgSize]), + arg_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + out[k] = op(in[k]); + } + } + } + else if constexpr (UnaryOperatorT::supports_sg_loadstore::value) { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * maxsgSize); + + if ((base + n_vecs * vec_sz * sgSize < nelems_) && + (maxsgSize == sgSize)) { + using in_ptrT = + sycl::multi_ptr; + using out_ptrT = + sycl::multi_ptr; + sycl::vec arg_vec; + sycl::vec res_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + res_vec[k] = op(arg_vec[k]); + } + sg.store(out_ptrT(&out[base + it * sgSize]), + res_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + out[k] = op(in[k]); + } + } + } + else { + std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; + size_t base = ndit.get_global_linear_id(); + + base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + for (size_t offset = base; + offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); + offset += sgSize) + { + out[offset] = op(in[offset]); + } + } + } +}; + +template +struct UnaryStridedFunctor +{ +private: + const argT *inp_ = nullptr; + resT *res_ = nullptr; + IndexerT inp_out_indexer_; + +public: + UnaryStridedFunctor(const argT *inp_p, + resT *res_p, + IndexerT inp_out_indexer) + : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &offsets_ = inp_out_indexer_(wid.get(0)); + const py::ssize_t &inp_offset = offsets_.get_first_offset(); + const py::ssize_t &res_offset = offsets_.get_second_offset(); + + UnaryOpT op{}; + + res_[res_offset] = op(inp_[inp_offset]); + } +}; + +template +struct BinaryContigFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + const size_t nelems_; + +public: + BinaryContigFunctor(const argT1 *inp1, + const argT2 *inp2, + resT *res, + const size_t n_elems) + : in1(inp1), in2(inp2), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + BinaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + + if constexpr (BinaryOperatorT::supports_sg_loadstore::value && + BinaryOperatorT::supports_vec::value) + { + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if ((base + n_vecs * vec_sz * sgSize < nelems_) && + (sgSize == maxsgSize)) { + using in_ptrT1 = + sycl::multi_ptr; + using in_ptrT2 = + sycl::multi_ptr; + using out_ptrT = + sycl::multi_ptr; + sycl::vec arg1_vec; + sycl::vec arg2_vec; + sycl::vec res_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + arg1_vec = + sg.load(in_ptrT1(&in1[base + it * sgSize])); + arg2_vec = + sg.load(in_ptrT2(&in2[base + it * sgSize])); + res_vec = op(arg1_vec, arg2_vec); + 
sg.store(out_ptrT(&out[base + it * sgSize]), + res_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else if constexpr (BinaryOperatorT::supports_sg_loadstore::value) { + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t maxsgSize = sg.get_max_local_range()[0]; + + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if ((base + n_vecs * vec_sz * sgSize < nelems_) && + (sgSize == maxsgSize)) { + using in_ptrT1 = + sycl::multi_ptr; + using in_ptrT2 = + sycl::multi_ptr; + using out_ptrT = + sycl::multi_ptr; + sycl::vec arg1_vec; + sycl::vec arg2_vec; + sycl::vec res_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + arg1_vec = + sg.load(in_ptrT1(&in1[base + it * sgSize])); + arg2_vec = + sg.load(in_ptrT2(&in2[base + it * sgSize])); +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + res_vec[vec_id] = + op(arg1_vec[vec_id], arg2_vec[vec_id]); + } + sg.store(out_ptrT(&out[base + it * sgSize]), + res_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else { + std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; + size_t base = ndit.get_global_linear_id(); + + base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + for (size_t offset = base; + offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); + offset += sgSize) + { + out[offset] = op(in1[offset], in2[offset]); + } + } + } +}; + +template +struct BinaryStridedFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + ThreeOffsets_IndexerT three_offsets_indexer_; + +public: + BinaryStridedFunctor(const argT1 *inp1_tp, + const argT2 *inp2_tp, + resT *res_tp, + ThreeOffsets_IndexerT inps_res_indexer) + : in1(inp1_tp), in2(inp2_tp), out(res_tp), + three_offsets_indexer_(inps_res_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &three_offsets_ = + three_offsets_indexer_(static_cast(wid.get(0))); + + const auto &inp1_offset = three_offsets_.get_first_offset(); + const auto &inp2_offset = three_offsets_.get_second_offset(); + const auto &out_offset = three_offsets_.get_third_offset(); + + BinaryOperatorT op{}; + out[out_offset] = op(in1[inp1_offset], in2[inp2_offset]); + } +}; + +template +struct BinaryContigMatrixContigRowBroadcastingFunctor +{ +private: + const argT1 *mat; + const argT2 *padded_vec; + resT *res; + size_t n_elems; + size_t n1; + +public: + BinaryContigMatrixContigRowBroadcastingFunctor(const argT1 *mat_tp, + const argT2 *row_tp, + resT *res_tp, + size_t n_elems_in_mat, + size_t n_elems_in_row) + : mat(mat_tp), padded_vec(row_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + auto sg = ndit.get_sub_group(); + size_t gid = ndit.get_global_linear_id(); + + std::uint8_t sgSize = sg.get_local_range()[0]; + size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + using in_ptrT1 = + sycl::multi_ptr; + using in_ptrT2 = + sycl::multi_ptr; + using res_ptrT = + sycl::multi_ptr; + + const argT1 mat_el = sg.load(in_ptrT1(&mat[base])); + const argT2 vec_el = 
sg.load(in_ptrT2(&padded_vec[base % n1])); + + resT res_el = op(mat_el, vec_el); + + sg.store(res_ptrT(&res[base]), res_el); + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < n_elems; + k += sgSize) { + res[k] = op(mat[k], padded_vec[k % n1]); + } + } + } +}; + +template +struct BinaryContigRowContigMatrixBroadcastingFunctor +{ +private: + const argT1 *padded_vec; + const argT2 *mat; + resT *res; + size_t n_elems; + size_t n1; + +public: + BinaryContigRowContigMatrixBroadcastingFunctor(const argT1 *row_tp, + const argT2 *mat_tp, + resT *res_tp, + size_t n_elems_in_mat, + size_t n_elems_in_row) + : padded_vec(row_tp), mat(mat_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + auto sg = ndit.get_sub_group(); + size_t gid = ndit.get_global_linear_id(); + + std::uint8_t sgSize = sg.get_local_range()[0]; + size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + using in_ptrT1 = + sycl::multi_ptr; + using in_ptrT2 = + sycl::multi_ptr; + using res_ptrT = + sycl::multi_ptr; + + const argT2 mat_el = sg.load(in_ptrT2(&mat[base])); + const argT1 vec_el = sg.load(in_ptrT1(&padded_vec[base % n1])); + + resT res_el = op(vec_el, mat_el); + + sg.store(res_ptrT(&res[base]), res_el); + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < n_elems; + k += sgSize) { + res[k] = op(padded_vec[k % n1], mat[k]); + } + } + } +}; + +// Typdefs for function pointers + +typedef sycl::event (*unary_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + const std::vector &); + +typedef sycl::event (*unary_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + int, + const py::ssize_t *, + const char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + py::ssize_t, + const char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &); + +typedef sycl::event (*binary_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + int, + const py::ssize_t *, + const char *, + py::ssize_t, + const char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)( + sycl::queue, + std::vector &, + size_t, + size_t, + const char *, + py::ssize_t, + const char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &); + +typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)( + sycl::queue, + std::vector &, + size_t, + size_t, + const char *, + py::ssize_t, + const char *, + py::ssize_t, + char *, + py::ssize_t, + const std::vector &); + +} // namespace elementwise_common +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp new file mode 100644 index 0000000000..b6859910e3 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -0,0 +1,210 @@ +//=== cos.hpp - Unary function COS ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COS(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace cos +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template struct CosFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) + { + return std::cos(in); + } +}; + +template +using CosContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using CosStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct CosOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class cos_contig_kernel; + +template +sycl::event cos_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event cos_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + constexpr size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + static_assert(lws % vec_sz == 0); + auto gws_range = sycl::range<1>( + ((nelems + n_vecs * lws * vec_sz - 1) / (lws * n_vecs * vec_sz)) * + lws); + auto lws_range = sycl::range<1>(lws); + + using resTy = typename CosOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for>( + sycl::nd_range<1>(gws_range, lws_range), + CosContigFunctor(arg_tp, res_tp, + nelems)); + }); + return cos_ev; +} + +template struct CosContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cos_contig_impl; + return fn; + } + } +}; + +template struct CosTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::cos(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CosOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class cos_strided_kernel; + +template +sycl::event cos_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename CosOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT arg_res_indexer(nd, arg_offset, res_offset, shape_and_strides); + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + sycl::range<1> gRange{nelems}; + + cgh.parallel_for>( + gRange, CosStridedFunctor(arg_tp, res_tp, + arg_res_indexer)); + }); + return comp_ev; +} + +template struct CosStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cos_strided_impl; + return fn; + } + } +}; + +} // namespace cos +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp new file mode 100644 index 0000000000..edbbc393c5 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -0,0 +1,287 @@ +//=== equal.hpp - Binary function EQUAL ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of equality of +/// tensor elements. 
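+///
+/// The result type of the comparison is always bool; per-type-pair resolution
+/// follows the TypeMapResultEntry utilities added to utils/type_dispatch.hpp
+/// in this change. A reduced, illustrative sketch (the alias name below is
+/// hypothetical):
+///
+///   template <typename T1, typename T2>
+///   using equal_result_t = typename std::disjunction<
+///       td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+///       td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+///       td_ns::DefaultResultEntry<void>>::result_type;
+///
+///   static_assert(std::is_same_v<equal_result_t<float, float>, bool>);
+///   static_assert(std::is_same_v<equal_result_t<float, int>, void>);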
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace equal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct EqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return (in1 == in2); + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto tmp = (in1 == in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using EqualContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using EqualStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template struct EqualOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class equal_contig_kernel; + +template +sycl::event equal_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename EqualOutputType::value_type; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + cgh.parallel_for< + equal_contig_kernel>( + sycl::nd_range<1>(gws_range, lws_range), + EqualContigFunctor( + arg1_tp, arg2_tp, res_tp, nelems)); + }); + return comp_ev; +} + +template struct EqualContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename EqualOutputType::value_type, void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = equal_contig_impl; + return fn; + } + } +}; + +template struct EqualTypeMapFactory +{ + /*! 
@brief get typeid for output type of operator()==(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename EqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class equal_strided_strided_kernel; + +template +sycl::event +equal_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename EqualOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for< + equal_strided_strided_kernel>( + {nelems}, EqualStridedFunctor( + arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template struct EqualStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename EqualOutputType::value_type, void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = equal_strided_impl; + return fn; + } + } +}; + +} // namespace equal +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp new file mode 100644 index 0000000000..86340329da --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -0,0 +1,217 @@ +//=== isfinite.hpp - Unary function ISFINITE ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISFINITE(x) +/// function that tests whether a tensor element is finite. 
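+///
+/// For input types that can never hold a non-finite value, the functor below
+/// advertises a compile-time constant result, which lets the contiguous
+/// kernel fill the output without reading the input. A minimal sketch of
+/// that trait (names are illustrative):
+///
+///   template <typename argT> struct ConstantTrait
+///   {
+///       // bool and integral values are always finite
+///       using is_constant = std::disjunction<std::is_same<argT, bool>,
+///                                            std::is_integral<argT>>;
+///       static constexpr bool constant_value = true;
+///   };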
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace isfinite +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template struct IsFiniteFunctor +{ + static_assert(std::is_same_v); + + /* + std::is_same::value || + std::is_integral::value + */ + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = true; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isfinite = std::isfinite(std::real(in)); + const bool imag_isfinite = std::isfinite(std::imag(in)); + return (real_isfinite && imag_isfinite); + } + else if constexpr (std::is_same::value || + std::is_integral::value) + { + return constant_value; + } + else if constexpr (std::is_same_v) { + return sycl::isfinite(in); + } + else { + return std::isfinite(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) + { + auto const &res_vec = sycl::isfinite(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsFiniteContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using IsFiniteStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct IsFiniteOutputType +{ + using value_type = bool; +}; + +template +class isfinite_contig_kernel; + +template +sycl::event isfinite_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event isfinite_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + constexpr size_t lws = 64; + constexpr std::uint8_t vec_sz = 4; + constexpr std::uint8_t n_vecs = 2; + static_assert(lws % vec_sz == 0); + size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + auto gws_range = sycl::range<1>(n_groups * lws); + auto lws_range = sycl::range<1>(lws); + + using resTy = typename IsFiniteOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for< + class isfinite_contig_kernel>( + sycl::nd_range<1>(gws_range, lws_range), + IsFiniteContigFunctor(arg_tp, res_tp, + nelems)); + }); + return isfinite_ev; +} + +template struct IsFiniteContigFactory +{ + fnT get() + { + fnT fn = isfinite_contig_impl; + return fn; + } +}; + +template struct IsFiniteTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::isfinite(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsFiniteOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class isfinite_strided_kernel; + +template +sycl::event +isfinite_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event isfinite_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename IsFiniteOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tptr = reinterpret_cast(arg_p); + resTy *res_tptr = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, IsFiniteStridedFunctor( + arg_tptr, res_tptr, arg_res_indexer)); + }); + return isfinite_ev; +} + +template struct IsFiniteStridedFactory +{ + fnT get() + { + fnT fn = isfinite_strided_impl; + return fn; + } +}; + +} // namespace isfinite +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp new file mode 100644 index 0000000000..8f7dcca6c7 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -0,0 +1,216 @@ +//=== isinf.hpp - Unary function ISINF ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISINF(x) +/// function that tests whether a tensor element is an infinity. 
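+///
+/// For real floating-point inputs the functor below can be applied to a whole
+/// sycl::vec at once: sycl::isinf on a vector yields a vector of integral
+/// mask values, which is then narrowed to bool with the vec_cast helper added
+/// to utils/type_utils.hpp. Sketch of that conversion (sizes illustrative):
+///
+///   sycl::vec<float, 4> x{0.0f, 1.0f, 3.5f, -2.0f};
+///   auto mask = sycl::isinf(x);  // integral element type, deduced below
+///   using maskElemT = typename std::remove_cv_t<
+///       std::remove_reference_t<decltype(mask)>>::element_type;
+///   sycl::vec<bool, 4> flags =
+///       dpctl::tensor::type_utils::vec_cast<bool, maskElemT, 4>(mask);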
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace isinf +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template struct IsInfFunctor +{ + static_assert(std::is_same_v); + + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = false; + using supports_vec = + typename std::disjunction, + std::is_floating_point>; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isinf = std::isinf(std::real(in)); + const bool imag_isinf = std::isinf(std::imag(in)); + return (real_isinf || imag_isinf); + } + else if constexpr (std::is_same::value || + std::is_integral::value) + { + return constant_value; + } + else if constexpr (std::is_same_v) { + return sycl::isinf(in); + } + else { + return std::isinf(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) + { + auto const &res_vec = sycl::isinf(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsInfContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using IsInfStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct IsInfOutputType +{ + using value_type = bool; +}; + +template +class isinf_contig_kernel; + +template +sycl::event isinf_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event isinf_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + constexpr size_t lws = 64; + constexpr std::uint8_t vec_sz = 4; + constexpr std::uint8_t n_vecs = 2; + static_assert(lws % vec_sz == 0); + auto gws_range = sycl::range<1>( + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)) * + lws); + auto lws_range = sycl::range<1>(lws); + + using resTy = typename IsInfOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for< + class isinf_contig_kernel>( + sycl::nd_range<1>(gws_range, lws_range), + IsInfContigFunctor(arg_tp, res_tp, + nelems)); + }); + return isinf_ev; +} + +template struct IsInfContigFactory +{ + fnT get() + { + fnT fn = isinf_contig_impl; + return fn; + } +}; + +template struct IsInfTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::isinf(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsInfOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class isinf_strided_kernel; + +template +sycl::event +isinf_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename IsInfOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tptr = reinterpret_cast(arg_p); + resTy *res_tptr = reinterpret_cast(res_p); + + sycl::range<1> gRange{nelems}; + + cgh.parallel_for>( + gRange, IsInfStridedFunctor( + arg_tptr, res_tptr, arg_res_indexer)); + }); + return comp_ev; +} + +template struct IsInfStridedFactory +{ + fnT get() + { + fnT fn = isinf_strided_impl; + return fn; + } +}; + +} // namespace isinf +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp new file mode 100644 index 0000000000..3e4f68ed57 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -0,0 +1,361 @@ +//=== isnan.hpp - Unary function ISNAN ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISNAN(x) +/// function that tests whether a tensor element is a NaN. 
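+///
+/// The contiguous kernel in this file uses the same launch geometry as the
+/// other elementwise kernels in this change: work-groups of lws = 64 items,
+/// each item covering n_vecs * vec_sz = 8 elements. Worked example of the
+/// group-count arithmetic (the nelems value is illustrative):
+///
+///   constexpr size_t lws = 64;
+///   constexpr std::uint8_t vec_sz = 4;
+///   constexpr std::uint8_t n_vecs = 2;
+///   size_t nelems = 1000;
+///   size_t elems_per_wg = lws * vec_sz * n_vecs;                  // 512
+///   size_t n_groups = (nelems + elems_per_wg - 1) / elems_per_wg; // 2
+///   auto gws_range = sycl::range<1>(n_groups * lws);              // 128
+///   auto lws_range = sycl::range<1>(lws);
+///   // submitted as sycl::nd_range<1>(gws_range, lws_range)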
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace isnan +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template struct IsNanFunctor +{ + static_assert(std::is_same_v); + + /* + std::is_same::value || + std::is_integral::value + */ + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = false; + using supports_vec = typename std::true_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isnan = sycl::isnan(std::real(in)); + const bool imag_isnan = sycl::isnan(std::imag(in)); + return (real_isnan || imag_isnan); + } + else if constexpr (std::is_same::value || + std::is_integral::value) + { + return constant_value; + } + else { + return sycl::isnan(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) + { + auto const &res_vec = sycl::isnan(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsNanContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using IsNanStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct IsNanContigFunctorOld +{ +private: + const argT *in = nullptr; + resT *out = nullptr; + const size_t nelems_; + +public: + IsNanContigFunctorOld(const argT *inp, resT *res, const size_t nelems) + : in(inp), out(res), nelems_(nelems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + using dpctl::tensor::type_utils::is_complex; + using dpctl::tensor::type_utils::vec_cast; + + if constexpr (is_complex::value) { + std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0]; + size_t base = ndit.get_global_linear_id(); + + base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize); + for (size_t offset = base; + offset < std::min(nelems_, base + sgSize * (n_vecs * vec_sz)); + offset += sgSize) + { + const bool real_isnan = sycl::isnan(std::real(in[offset])); + const bool imag_isnan = sycl::isnan(std::imag(in[offset])); + out[offset] = real_isnan || imag_isnan; + } + } + else if constexpr (std::is_same::value || + std::is_integral::value) + { + using out_ptrT = + sycl::multi_ptr; + + auto sg = ndit.get_sub_group(); + std::uint8_t sgSize = sg.get_local_range()[0]; + std::uint8_t max_sgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + n_vecs * vec_sz * max_sgSize < nelems_ && + max_sgSize == sgSize) { + sycl::vec res_vec(false); +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + sg.store(out_ptrT(&out[base + it * sgSize]), + res_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + out[k] = false; + } + } + } + else { + using in_ptrT = + sycl::multi_ptr; + using out_ptrT = + sycl::multi_ptr; + static_assert(std::is_same::value); + + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = 
sg.get_local_range()[0]; + std::uint16_t max_sgSize = sg.get_max_local_range()[0]; + size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * max_sgSize); + if (base + n_vecs * vec_sz * max_sgSize < nelems_ && + sgSize == max_sgSize) { + sycl::vec x; + +#pragma unroll + for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + x = sg.load(in_ptrT(&in[base + it * sgSize])); + // returns vec + auto res_vec = sycl::isnan(x); + // cast it to bool + sycl::vec res_bool = + vec_cast(res_vec); + sg.store(out_ptrT(&out[base + it * sgSize]), + res_bool); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems_; + k += sgSize) { + out[k] = static_cast(sycl::isnan(in[k])); + } + } + } + } +}; + +template struct IsNanOutputType +{ + using value_type = bool; +}; + +template +class isnan_contig_kernel; + +template +sycl::event isnan_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event isnan_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + constexpr size_t lws = 64; + constexpr std::uint8_t vec_sz = 4; + constexpr std::uint8_t n_vecs = 2; + static_assert(lws % vec_sz == 0); + auto gws_range = sycl::range<1>( + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)) * + lws); + auto lws_range = sycl::range<1>(lws); + + using resTy = typename IsNanOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for< + class isnan_contig_kernel>( + sycl::nd_range<1>(gws_range, lws_range), + IsNanContigFunctor(arg_tp, res_tp, + nelems)); + }); + return isnan_ev; +} + +template struct IsNanContigFactory +{ + fnT get() + { + fnT fn = isnan_contig_impl; + return fn; + } +}; + +template struct IsNanTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::isnan(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsNanOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +struct IsNanStridedFunctorOld +{ +private: + const argT *inp_ = nullptr; + resT *res_ = nullptr; + IndexerT inp_out_indexer_; + +public: + IsNanStridedFunctorOld(const argT *inp_p, + resT *res_p, + IndexerT inp_out_indexer) + : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const argT *const &in = inp_; + resT *const &out = res_; + + auto offsets_ = inp_out_indexer_(wid.get(0)); + const py::ssize_t &inp_offset = offsets_.get_first_offset(); + const py::ssize_t &out_offset = offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (std::is_same_v || + (std::is_integral::value)) { + out[out_offset] = false; + } + else if constexpr (is_complex::value) { + const bool real_isnan = sycl::isnan(std::real(in[inp_offset])); + const bool imag_isnan = sycl::isnan(std::imag(in[inp_offset])); + + out[out_offset] = real_isnan || imag_isnan; + } + else { + out[out_offset] = sycl::isnan(in[inp_offset]); + } + } +}; + +template class isnan_strided_kernel; + +template +sycl::event +isnan_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename IsNanOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT arg_res_indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tptr = reinterpret_cast(arg_p); + resTy *res_tptr = reinterpret_cast(res_p); + + cgh.parallel_for>( + {nelems}, IsNanStridedFunctor( + arg_tptr, res_tptr, arg_res_indexer)); + }); + return comp_ev; +} + +template struct IsNanStridedFactory +{ + fnT get() + { + fnT fn = isnan_strided_impl; + return fn; + } +}; + +} // namespace isnan +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp new file mode 100644 index 0000000000..7e576c8746 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -0,0 +1,213 @@ +//=== sqrt.hpp - Unary function SQRT ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SQRT(x) +/// function that compute a square root. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace sqrt +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template struct SqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) + { + return std::sqrt(in); + } +}; + +template +using SqrtContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using SqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct SqrtOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class sqrt_contig_kernel; + +template +sycl::event sqrt_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + sycl::event sqrt_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + constexpr size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + static_assert(lws % vec_sz == 0); + auto gws_range = sycl::range<1>( + ((nelems + n_vecs * lws * vec_sz - 1) / (lws * n_vecs * vec_sz)) * + lws); + auto lws_range = sycl::range<1>(lws); + + using resTy = typename SqrtOutputType::value_type; + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + cgh.parallel_for< + class sqrt_contig_kernel>( + sycl::nd_range<1>(gws_range, lws_range), + SqrtContigFunctor(arg_tp, res_tp, + nelems)); + }); + return sqrt_ev; +} + +template struct SqrtContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sqrt_contig_impl; + return fn; + } + } +}; + +template struct SqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::sqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class sqrt_strided_kernel; + +template +sycl::event +sqrt_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename SqrtOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + IndexerT arg_res_indexer(nd, arg_offset, res_offset, shape_and_strides); + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + sycl::range<1> gRange{nelems}; + + cgh.parallel_for>( + gRange, SqrtStridedFunctor( + arg_tp, res_tp, arg_res_indexer)); + }); + return comp_ev; +} + +template struct SqrtStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sqrt_strided_impl; + return fn; + } + } +}; + +} // namespace sqrt +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp new file mode 100644 index 0000000000..f34da1b415 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -0,0 +1,520 @@ +//=== true_divide.hpp - Binary function DIVIDE ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of DIVIDE(x1, x2) +/// function. 
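+///
+/// The *Factory structs defined below are meant to populate per-type-pair
+/// dispatch tables of the function-pointer typedefs from
+/// kernels/elementwise_functions/common.hpp. The sketch below assumes the
+/// DispatchTableBuilder utility from utils/type_dispatch.hpp; the actual
+/// wiring is done outside this header:
+///
+///   using elementwise_common::binary_contig_impl_fn_ptr_t;
+///   static binary_contig_impl_fn_ptr_t
+///       div_contig_table[td_ns::num_types][td_ns::num_types];
+///
+///   td_ns::DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+///                               TrueDivideContigFactory, td_ns::num_types>
+///       dtb;
+///   dtb.populate_dispatch_table(div_contig_table);
+///   // unsupported type pairs are left as nullptr by the factory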
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace true_divide +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct TrueDivideFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return in1 / in2; + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto tmp = in1 / in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor, + vec_sz, + n_vecs>; + +template +using TrueDivideStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + TrueDivideFunctor>; + +template struct TrueDivideOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + float, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + double, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class true_divide_contig_kernel; + +template +sycl::event +true_divide_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + size_t lws = 64; + constexpr unsigned int vec_sz = 4; + constexpr unsigned int n_vecs = 2; + const size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename TrueDivideOutputType::value_type; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + cgh.parallel_for< + true_divide_contig_kernel>( + sycl::nd_range<1>(gws_range, lws_range), + TrueDivideContigFunctor( + arg1_tp, arg2_tp, res_tp, nelems)); + }); + return comp_ev; +} + +template struct TrueDivideContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename TrueDivideOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_contig_impl; + return fn; + } + } +}; + +template +struct TrueDivideTypeMapFactory +{ + /*! 
@brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename TrueDivideOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class true_divide_strided_strided_kernel; + +template +sycl::event +true_divide_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename TrueDivideOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + sycl::range<1> gRange(nelems); + + cgh.parallel_for>( + gRange, TrueDivideStridedFunctor( + arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template +struct TrueDivideStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename TrueDivideOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_strided_impl; + return fn; + } + } +}; + +template +using TrueDivideContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor>; + +template +using TrueDivideContigRowContigMatrixBroadcastingFunctor = + elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor>; + +template +class true_divide_matrix_row_broadcast_sg_krn; + +template +class true_divide_row_matrix_broadcast_sg_krn; + +template +sycl::event true_divide_contig_matrix_contig_row_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] / vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; + const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + size_t n1_padded = n1 + max_sgSize; + argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + + if (padded_vec == nullptr) { + throw std::runtime_error("Could not allocate memory on the device"); + } + sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); // ensure vec contains actual data + cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { + auto i = id[0]; + padded_vec[i] = vec[i % n1]; + }); + }); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sg.load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to + // ensure that reads are accessible + + size_t lws = 64; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + size_t n_elems = n0 * n1; + size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + cgh.parallel_for< + class true_divide_matrix_row_broadcast_sg_krn>( + sycl::nd_range<1>(gwsRange, lwsRange), + TrueDivideContigMatrixContigRowBroadcastingFunctor( + mat, padded_vec, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + sycl::context ctx = exec_q.get_context(); + cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +template +struct TrueDivideContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename TrueDivideOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename TrueDivideOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + true_divide_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event true_divide_contig_row_contig_matrix_broadcast_impl( + sycl::queue exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] + vec[j] + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; + const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + size_t n1_padded = n1 + max_sgSize; + argT2 *padded_vec = sycl::malloc_device(n1_padded, exec_q); + + if (padded_vec == nullptr) { + throw std::runtime_error("Could not allocate memory on the device"); + } + sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); // ensure vec contains actual data + cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) { + auto i = id[0]; + padded_vec[i] = vec[i % n1]; + }); + }); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sg.load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to + // ensure that reads are accessible + + size_t lws = 64; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + size_t n_elems = n0 * n1; + size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + cgh.parallel_for< + class true_divide_row_matrix_broadcast_sg_krn>( + sycl::nd_range<1>(gwsRange, lwsRange), + TrueDivideContigRowContigMatrixBroadcastingFunctor( + padded_vec, mat, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + sycl::context ctx = exec_q.get_context(); + cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +}; + +template +struct TrueDivideContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename TrueDivideOutputType::value_type; + if constexpr (std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + true_divide_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace true_divide +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index ce24dc799a..0f60c7a4b2 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
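The row-broadcast implementations in true_divide.hpp above copy the (n1,) row into a device scratch buffer padded by the device's maximum sub-group size, so a sub-group load that starts near the end of the row never reads past the allocation. Below is a minimal, self-contained sketch of just that preparation step; the queue, the sizes, and the float element type are placeholders standing in for the kernel's template parameters.

    #include <sycl/sycl.hpp>

    #include <cstddef>
    #include <stdexcept>

    int main()
    {
        sycl::queue q;

        const std::size_t n1 = 1000;               // row length (placeholder)
        const std::size_t max_sg = 32;              // assumed max sub-group size
        const std::size_t n1_padded = n1 + max_sg;  // tail keeps sg.load in bounds

        float *row = sycl::malloc_device<float>(n1, q);
        float *padded = sycl::malloc_device<float>(n1_padded, q);
        if (row == nullptr || padded == nullptr) {
            throw std::runtime_error("Could not allocate memory on the device");
        }

        q.fill(row, 1.0f, n1).wait();

        // padded[i] wraps around, so a read that starts at an offset base % n1
        // close to n1 still lands inside the allocation
        q.parallel_for(sycl::range<1>{n1_padded}, [=](sycl::id<1> id) {
             padded[id[0]] = row[id[0] % n1];
         }).wait();

        sycl::free(padded, q);
        sycl::free(row, q);
        return 0;
    }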
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp index 5d7d6b8a8c..afc458169e 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -250,6 +250,136 @@ struct usm_ndarray_types } }; +/*! @brief struct to define result_type typename for Ty == ArgTy */ +template +struct TypeMapResultEntry : std::bool_constant> +{ + using result_type = ResTy; +}; + +/*! @brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 == + * ArgTy2 */ +template +struct BinaryTypeMapResultEntry + : std::bool_constant, + std::is_same>> +{ + using result_type = ResTy; +}; + +/*! @brief fall-through struct with specified result_type, usually void */ +template struct DefaultResultEntry : std::true_type +{ + using result_type = Ty; +}; + +/*! @brief Utility struct to convert C++ type into typeid integer */ +template struct GetTypeid +{ + int get() + { + if constexpr (std::is_same_v) { + return static_cast(typenum_t::BOOL); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::HALF); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::FLOAT); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::DOUBLE); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CFLOAT); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CDOUBLE); + } + else if constexpr (std::is_same_v) { // special token + return -1; + } + + assert(("Unsupported type T", false)); + return -2; + } +}; + +/*! @brief Class to generate vector of null function pointers */ +template struct NullPtrVector +{ + + using value_type = FunPtrT; + using const_reference = value_type const &; + + NullPtrVector() : val(nullptr) {} + + const_reference operator[](int) const + { + return val; + } + +private: + value_type val; +}; + +/*! 
@brief Class to generate table of null function pointers */ +template struct NullPtrTable +{ + using value_type = NullPtrVector; + using const_reference = value_type const &; + + NullPtrTable() : val() {} + + const_reference operator[](int) const + { + return val; + } + +private: + value_type val; +}; + +template +struct TypePairDefinedEntry : std::bool_constant && + std::is_same_v> +{ + static constexpr bool is_defined = true; +}; + +struct NotDefinedEntry : std::true_type +{ + static constexpr bool is_defined = false; +}; + } // namespace type_dispatch } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp index 8464418d8b..36c00404c9 100644 --- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -26,6 +26,7 @@ #include #include #include +#include namespace dpctl { @@ -100,6 +101,21 @@ template void validate_type_for_device(const sycl::queue &q) validate_type_for_device(q.get_device()); } +template +auto vec_cast_impl(const Vec &v, std::index_sequence) +{ + return Op{v[I]...}; +} + +template > +auto vec_cast(const sycl::vec &s) +{ + return vec_cast_impl, sycl::vec>(s, Indices{}); +} + } // namespace type_utils } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 0a706146e2..59f62af5f1 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp index 96c264f563..e6e8a54ed6 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index d8692c1098..57386e736f 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
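The TypeMapResultEntry, BinaryTypeMapResultEntry and DefaultResultEntry helpers added to type_dispatch.hpp above are bool_constant-style entries that also carry a result_type, which lets a kernel header spell out its supported-type map as a single std::disjunction: the first entry whose condition holds supplies result_type, and a trailing DefaultResultEntry with void marks every other input as unsupported (BinaryTypeMapResultEntry does the same over a pair of argument types). A compilable sketch of that composition, using simplified stand-in entries and an illustrative CosOutputType-style table rather than the actual kernel headers:

```cpp
// Sketch: how bool_constant-style map entries compose through
// std::disjunction.  The entry templates below are simplified stand-ins,
// and CosOutputType is an illustrative table, not the kernel header's.
#include <complex>
#include <type_traits>

template <typename Ty, typename ArgTy, typename ResTy = ArgTy>
struct TypeMapResultEntry : std::bool_constant<std::is_same_v<Ty, ArgTy>>
{
    using result_type = ResTy;
};

template <typename Ty> struct DefaultResultEntry : std::true_type
{
    using result_type = Ty;
};

// std::disjunction derives from the first entry whose ::value is true, so
// that entry's result_type becomes visible; DefaultResultEntry<void> is the
// fall-through meaning "unsupported input type".
template <typename T> struct CosOutputType
{
    using value_type = typename std::disjunction<
        TypeMapResultEntry<T, float>,
        TypeMapResultEntry<T, double>,
        TypeMapResultEntry<T, std::complex<float>>,
        TypeMapResultEntry<T, std::complex<double>>,
        DefaultResultEntry<void>>::result_type;
};

static_assert(std::is_same_v<CosOutputType<float>::value_type, float>);
static_assert(std::is_same_v<CosOutputType<int>::value_type, void>);

int main() { return 0; }
```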
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp index 192d70c0f2..109062516a 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp index d4f7f50437..7f4b8d718c 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp index 51c3719b97..09caddf824 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index f4e29411b0..0464a14cd3 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp index 16adb921ee..e5bf513921 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp index 74ae3464fc..16ae43ba97 100644 --- a/dpctl/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl/tensor/libtensor/source/device_support_queries.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
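The vec_cast helper added to utils/type_utils.hpp above converts a sycl::vec element-wise by expanding an index_sequence into a brace-initializer of the destination vector type. The sketch below restates that idiom with template parameter lists of its own choosing and a hypothetical usage in main; only the Op{v[I]...} expansion pattern is taken from the helper itself.

```cpp
// Sketch of an index_sequence-based element-wise sycl::vec cast.  The
// template parameter lists here are this sketch's own assumptions; only the
// Op{v[I]...} expansion pattern mirrors the helper above.
#include <sycl/sycl.hpp>

#include <cstddef>
#include <utility>

template <typename Op, typename Vec, std::size_t... I>
auto vec_cast_impl(const Vec &v, std::index_sequence<I...>)
{
    // Brace-initialize the destination vector from the source's elements,
    // letting the destination element type perform the conversion.
    return Op{v[I]...};
}

template <typename dstT,
          typename srcT,
          std::size_t N,
          typename Indices = std::make_index_sequence<N>>
auto vec_cast(const sycl::vec<srcT, N> &s)
{
    return vec_cast_impl<sycl::vec<dstT, N>, sycl::vec<srcT, N>>(s, Indices{});
}

int main()
{
    sycl::vec<float, 4> f{1.5f, 2.5f, -3.5f, 4.0f};
    sycl::vec<double, 4> d = vec_cast<double, float, 4>(f);
    return (d[0] == 1.5) ? 0 : 1;
}
```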
diff --git a/dpctl/tensor/libtensor/source/device_support_queries.hpp b/dpctl/tensor/libtensor/source/device_support_queries.hpp index 905ba4b535..a54835fc75 100644 --- a/dpctl/tensor/libtensor/source/device_support_queries.hpp +++ b/dpctl/tensor/libtensor/source/device_support_queries.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp new file mode 100644 index 0000000000..de209fdb1c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp @@ -0,0 +1,1215 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/abs.hpp" +#include "kernels/elementwise_functions/add.hpp" +#include "kernels/elementwise_functions/cos.hpp" +#include "kernels/elementwise_functions/equal.hpp" +#include "kernels/elementwise_functions/isfinite.hpp" +#include "kernels/elementwise_functions/isinf.hpp" +#include "kernels/elementwise_functions/isnan.hpp" +#include "kernels/elementwise_functions/sqrt.hpp" +#include "kernels/elementwise_functions/true_divide.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t) +{ + switch (dst_typenum_t) { + case td_ns::typenum_t::BOOL: + return py::dtype("?"); + case td_ns::typenum_t::INT8: + return py::dtype("i1"); + case td_ns::typenum_t::UINT8: + return py::dtype("u1"); + case td_ns::typenum_t::INT16: + return py::dtype("i2"); + case td_ns::typenum_t::UINT16: + return py::dtype("u2"); + case td_ns::typenum_t::INT32: + return py::dtype("i4"); + case td_ns::typenum_t::UINT32: + return py::dtype("u4"); + case td_ns::typenum_t::INT64: + return py::dtype("i8"); + case td_ns::typenum_t::UINT64: + return py::dtype("u8"); + case td_ns::typenum_t::HALF: + return py::dtype("f2"); + case td_ns::typenum_t::FLOAT: + return py::dtype("f4"); + case td_ns::typenum_t::DOUBLE: + return py::dtype("f8"); + case td_ns::typenum_t::CFLOAT: + return py::dtype("c8"); + case td_ns::typenum_t::CDOUBLE: + return py::dtype("c16"); + default: + throw py::value_error("Unrecognized dst_typeid"); + } +} + +int _result_typeid(int arg_typeid, const int *fn_output_id) +{ + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) { + throw py::value_error("Input typeid " + std::to_string(arg_typeid) + + " is outside of expected bounds."); + } + + return fn_output_id[arg_typeid]; +} + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U01: ==== ABS (x) +namespace impl +{ + +namespace abs_fn_ns = dpctl::tensor::kernels::abs; + +static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types]; +static int abs_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + abs_strided_dispatch_vector[td_ns::num_types]; + +void populate_abs_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = abs_fn_ns; + + using fn_ns::AbsContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(abs_contig_dispatch_vector); + + using fn_ns::AbsStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(abs_strided_dispatch_vector); + + using fn_ns::AbsTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(abs_output_typeid_vector); +}; + +} // namespace impl + +// U02: ==== ACOS (x) +namespace impl +{ +// FIXME: add code for U02 +} // namespace impl + +// U03: ===== ACOSH (x) +namespace impl +{ +// FIXME: add code for U03 +} // namespace impl + +// B01: ===== ADD (x1, x2) +namespace 
impl +{ +namespace add_fn_ns = dpctl::tensor::kernels::add; + +static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int add_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + add_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// add(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +// add(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_add_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = add_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::AddTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(add_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::AddStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(add_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::AddContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(add_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::AddContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + AddContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + add_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::AddContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + AddContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + add_contig_row_contig_matrix_broadcast_dispatch_table); +}; + +} // namespace impl + +// U04: ===== ASIN (x) +namespace impl +{ +// FIXME: add code for U04 +} // namespace impl + +// U05: ===== ASINH (x) +namespace impl +{ +// FIXME: add code for U05 +} // namespace impl + +// U06: ===== ATAN (x) +namespace impl +{ +// FIXME: add code for U06 +} // namespace impl + +// B02: ===== ATAN2 (x1, x2) +namespace impl +{ +// FIXME: add code for B02 +} // namespace impl + +// U07: ===== ATANH (x) +namespace impl +{ +// FIXME: add code for U07 +} // namespace impl + +// B03: ===== BITWISE_AND (x1, x2) +namespace impl +{ +// FIXME: add code for B03 +} // namespace impl + +// B04: ===== BITWISE_LEFT_SHIFT (x1, x2) +namespace impl +{ +// FIXME: add code for B04 +} // namespace impl + +// U08: ===== BITWISE_INVERT (x) +namespace impl +{ +// FIXME: add code for U08 +} // namespace impl + +// B05: ===== BITWISE_OR (x1, x2) +namespace impl +{ +// FIXME: add code for B05 +} // namespace impl + +// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +namespace impl +{ +// FIXME: add code for B06 +} // namespace impl + +// B07: ===== BITWISE_XOR (x1, x2) +namespace impl +{ +// FIXME: add code for B07 +} // namespace impl + +// U09: ==== CEIL (x) +namespace impl +{ +// FIXME: add code for U09 +} // namespace impl + +// U10: ==== CONJ (x) +namespace impl +{ +// FIXME: add code for U10 +} // namespace impl + +// U11: ==== COS (x) 
+namespace impl +{ + +namespace cos_fn_ns = dpctl::tensor::kernels::cos; + +static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types]; +static int cos_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cos_strided_dispatch_vector[td_ns::num_types]; + +void populate_cos_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cos_fn_ns; + + using fn_ns::CosContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cos_contig_dispatch_vector); + + using fn_ns::CosStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cos_strided_dispatch_vector); + + using fn_ns::CosTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cos_output_typeid_vector); +} + +} // namespace impl + +// U12: ==== COSH (x) +namespace impl +{ +// FIXME: add code for U12 +} // namespace impl + +// B08: ==== DIVIDE (x1, x2) +namespace impl +{ +namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide; + +static binary_contig_impl_fn_ptr_t + true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +// divide(matrix, row) +static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t + true_divide_contig_matrix_contig_row_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +// divide(row, matrix) +static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t + true_divide_contig_row_contig_matrix_broadcast_dispatch_table + [td_ns::num_types][td_ns::num_types]; + +void populate_true_divide_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = true_divide_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::TrueDivideTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(true_divide_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::TrueDivideStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(true_divide_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::TrueDivideContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(true_divide_contig_dispatch_table); + + // function pointers for operation on contiguous matrix, contiguous row + // with contiguous matrix output + using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory; + DispatchTableBuilder< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, + TrueDivideContigMatrixContigRowBroadcastFactory, num_types> + dtb4; + dtb4.populate_dispatch_table( + true_divide_contig_matrix_contig_row_broadcast_dispatch_table); + + // function pointers for operation on contiguous row, contiguous matrix + // with contiguous matrix output + using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory; + DispatchTableBuilder< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, + TrueDivideContigRowContigMatrixBroadcastFactory, num_types> + dtb5; + dtb5.populate_dispatch_table( + true_divide_contig_row_contig_matrix_broadcast_dispatch_table); +}; + +} // namespace impl + +// B09: ==== EQUAL (x1, x2) +namespace impl +{ +namespace equal_fn_ns = dpctl::tensor::kernels::equal; + +static binary_contig_impl_fn_ptr_t + equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int 
equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::EqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::EqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::EqualContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(equal_contig_dispatch_table); +}; +} // namespace impl + +// U13: ==== EXP (x) +namespace impl +{ +// FIXME: add code for U13 +} // namespace impl + +// U14: ==== EXPM1 (x) +namespace impl +{ +// FIXME: add code for U14 +} // namespace impl + +// U15: ==== FLOOR (x) +namespace impl +{ +// FIXME: add code for U15 +} // namespace impl + +// B10: ==== FLOOR_DIVIDE (x1, x2) +namespace impl +{ +// FIXME: add code for B10 +} // namespace impl + +// B11: ==== GREATER (x1, x2) +namespace impl +{ +// FIXME: add code for B11 +} // namespace impl + +// B12: ==== GREATER_EQUAL (x1, x2) +namespace impl +{ +// FIXME: add code for B12 +} // namespace impl + +// U16: ==== IMAG (x) +namespace impl +{ +// FIXME: add code for U16 +} // namespace impl + +// U17: ==== ISFINITE (x) +namespace impl +{ +namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite; + +static unary_contig_impl_fn_ptr_t + isfinite_contig_dispatch_vector[td_ns::num_types]; +static int isfinite_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isfinite_strided_dispatch_vector[td_ns::num_types]; + +void populate_isfinite_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isfinite_fn_ns; + + using fn_ns::IsFiniteContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector); + + using fn_ns::IsFiniteStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector); + + using fn_ns::IsFiniteTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isfinite_output_typeid_vector); +} + +} // namespace impl + +// U18: ==== ISINF (x) +namespace impl +{ +namespace isinf_fn_ns = dpctl::tensor::kernels::isinf; + +static unary_contig_impl_fn_ptr_t + isinf_contig_dispatch_vector[td_ns::num_types]; +static int isinf_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isinf_strided_dispatch_vector[td_ns::num_types]; + +void populate_isinf_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isinf_fn_ns; + + using fn_ns::IsInfContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector); + + using fn_ns::IsInfStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector); + + using fn_ns::IsInfTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isinf_output_typeid_vector); +} + +} // namespace impl + +// U19: ==== ISNAN (x) +namespace impl +{ +namespace isnan_fn_ns = dpctl::tensor::kernels::isnan; + +static unary_contig_impl_fn_ptr_t + isnan_contig_dispatch_vector[td_ns::num_types]; +static int 
isnan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isnan_strided_dispatch_vector[td_ns::num_types]; + +void populate_isnan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isnan_fn_ns; + + using fn_ns::IsNanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector); + + using fn_ns::IsNanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector); + + using fn_ns::IsNanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isnan_output_typeid_vector); +} + +} // namespace impl + +// B13: ==== LESS (x1, x2) +namespace impl +{ +// FIXME: add code for B13 +} // namespace impl + +// B14: ==== LESS_EQUAL (x1, x2) +namespace impl +{ +// FIXME: add code for B14 +} // namespace impl + +// U20: ==== LOG (x) +namespace impl +{ +// FIXME: add code for U20 +} // namespace impl + +// U21: ==== LOG1P (x) +namespace impl +{ +// FIXME: add code for U21 +} // namespace impl + +// U22: ==== LOG2 (x) +namespace impl +{ +// FIXME: add code for U22 +} // namespace impl + +// U23: ==== LOG10 (x) +namespace impl +{ +// FIXME: add code for U23 +} // namespace impl + +// B15: ==== LOGADDEXP (x1, x2) +namespace impl +{ +// FIXME: add code for B15 +} // namespace impl + +// B16: ==== LOGICAL_AND (x1, x2) +namespace impl +{ +// FIXME: add code for B16 +} // namespace impl + +// U24: ==== LOGICAL_NOT (x) +namespace impl +{ +// FIXME: add code for U24 +} // namespace impl + +// B17: ==== LOGICAL_OR (x1, x2) +namespace impl +{ +// FIXME: add code for B17 +} // namespace impl + +// B18: ==== LOGICAL_XOR (x1, x2) +namespace impl +{ +// FIXME: add code for B18 +} // namespace impl + +// B19: ==== MULTIPLY (x1, x2) +namespace impl +{ +// FIXME: add code for B19 +} // namespace impl + +// U25: ==== NEGATIVE (x) +namespace impl +{ +// FIXME: add code for U25 +} // namespace impl + +// B20: ==== NOT_EQUAL (x1, x2) +namespace impl +{ +// FIXME: add code for B20 +} // namespace impl + +// U26: ==== POSITIVE (x) +namespace impl +{ +// FIXME: add code for U26 +} // namespace impl + +// B21: ==== POW (x1, x2) +namespace impl +{ +// FIXME: add code for B21 +} // namespace impl + +// U27: ==== REAL (x) +namespace impl +{ +// FIXME: add code for U27 +} // namespace impl + +// B22: ==== REMAINDER (x1, x2) +namespace impl +{ +// FIXME: add code for B22 +} // namespace impl + +// U28: ==== ROUND (x) +namespace impl +{ +// FIXME: add code for U28 +} // namespace impl + +// U29: ==== SIGN (x) +namespace impl +{ +// FIXME: add code for U29 +} // namespace impl + +// U30: ==== SIN (x) +namespace impl +{ +// FIXME: add code for U30 +} // namespace impl + +// U31: ==== SINH (x) +namespace impl +{ +// FIXME: add code for U31 +} // namespace impl + +// U32: ==== SQUARE (x) +namespace impl +{ +// FIXME: add code for U32 +} // namespace impl + +// U33: ==== SQRT (x) +namespace impl +{ + +namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt; + +static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types]; +static int sqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_sqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sqrt_fn_ns; + + using fn_ns::SqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector); + + using fn_ns::SqrtStridedFactory; + DispatchVectorBuilder + dvb2; + 
dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector); + + using fn_ns::SqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sqrt_output_typeid_vector); +} + +} // namespace impl + +// B23: ==== SUBTRACT (x1, x2) +namespace impl +{ +// FIXME: add code for B23 +} // namespace impl + +// U34: ==== TAN (x) +namespace impl +{ +// FIXME: add code for U34 +} // namespace impl + +// U35: ==== TANH (x) +namespace impl +{ +// FIXME: add code for U35 +} // namespace impl + +// U36: ==== TRUNC (x) +namespace impl +{ +// FIXME: add code for U36 +} // namespace impl + +// ========================================================================================== +// // + +namespace py = pybind11; + +void init_elementwise_functions(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + // U01: ==== ABS (x) + { + impl::populate_abs_dispatch_vectors(); + using impl::abs_contig_dispatch_vector; + using impl::abs_output_typeid_vector; + using impl::abs_strided_dispatch_vector; + + auto abs_pyapi = [&](arrayT src, arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, abs_output_typeid_vector, + abs_contig_dispatch_vector, abs_strided_dispatch_vector); + }; + m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto abs_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector); + }; + m.def("_abs_result_type", abs_result_type_pyapi); + } + + // U02: ==== ACOS (x) + // FIXME: + + // U03: ===== ACOSH (x) + // FIXME: + + // B01: ===== ADD (x1, x2) + { + impl::populate_add_dispatch_tables(); + using impl::add_contig_dispatch_table; + using impl::add_contig_matrix_contig_row_broadcast_dispatch_table; + using impl::add_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::add_output_id_table; + using impl::add_strided_dispatch_table; + + auto add_pyapi = [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, + dpctl::tensor::usm_ndarray dst, sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, add_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + add_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + add_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + add_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + add_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto add_result_type_pyapi = [&](py::dtype dtype1, py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + add_output_id_table); + }; + m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_add_result_type", add_result_type_pyapi, ""); + } + + // U04: ===== ASIN (x) + // FIXME: + + // U05: ===== ASINH (x) + // FIXME: + + // U06: ===== ATAN (x) + // FIXME: + + // B02: ===== ATAN2 (x1, x2) + // FIXME: + + // U07: ===== ATANH (x) + // FIXME: + + // B03: ===== BITWISE_AND (x1, x2) + // FIXME: + + // B04: ===== BITWISE_LEFT_SHIFT (x1, x2) + // 
FIXME: + + // U08: ===== BITWISE_INVERT (x) + // FIXME: + + // B05: ===== BITWISE_OR (x1, x2) + // FIXME: + + // B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) + // FIXME: + + // B07: ===== BITWISE_XOR (x1, x2) + // FIXME: + + // U09: ==== CEIL (x) + // FIXME: + + // U10: ==== CONJ (x) + // FIXME: + + // U11: ==== COS (x) + { + impl::populate_cos_dispatch_vectors(); + using impl::cos_contig_dispatch_vector; + using impl::cos_output_typeid_vector; + using impl::cos_strided_dispatch_vector; + + auto cos_pyapi = [&](arrayT src, arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cos_output_typeid_vector, + cos_contig_dispatch_vector, cos_strided_dispatch_vector); + }; + m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cos_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector); + }; + m.def("_cos_result_type", cos_result_type_pyapi); + } + + // U12: ==== COSH (x) + // FIXME: + + // B08: ==== DIVIDE (x1, x2) + { + impl::populate_true_divide_dispatch_tables(); + using impl::true_divide_contig_dispatch_table; + using impl:: + true_divide_contig_matrix_contig_row_broadcast_dispatch_table; + using impl:: + true_divide_contig_row_contig_matrix_broadcast_dispatch_table; + using impl::true_divide_output_id_table; + using impl::true_divide_strided_dispatch_table; + + auto divide_pyapi = [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, true_divide_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + true_divide_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + true_divide_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + true_divide_contig_matrix_contig_row_broadcast_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + true_divide_contig_row_contig_matrix_broadcast_dispatch_table); + }; + auto divide_result_type_pyapi = [&](py::dtype dtype1, + py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + true_divide_output_id_table); + }; + m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_divide_result_type", divide_result_type_pyapi, ""); + } + + // B09: ==== EQUAL (x1, x2) + { + impl::populate_equal_dispatch_tables(); + using impl::equal_contig_dispatch_table; + using impl::equal_output_id_table; + using impl::equal_strided_dispatch_table; + + auto equal_pyapi = [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // 
c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto equal_result_type_pyapi = [&](py::dtype dtype1, py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + equal_output_id_table); + }; + m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_equal_result_type", equal_result_type_pyapi, ""); + } + + // U13: ==== EXP (x) + // FIXME: + + // U14: ==== EXPM1 (x) + // FIXME: + + // U15: ==== FLOOR (x) + // FIXME: + + // B10: ==== FLOOR_DIVIDE (x1, x2) + // FIXME: + + // B11: ==== GREATER (x1, x2) + // FIXME: + + // B12: ==== GREATER_EQUAL (x1, x2) + // FIXME: + + // U16: ==== IMAG (x) + // FIXME: + + // U17: ==== ISFINITE (x) + { + impl::populate_isfinite_dispatch_vectors(); + + using impl::isfinite_contig_dispatch_vector; + using impl::isfinite_output_typeid_vector; + using impl::isfinite_strided_dispatch_vector; + auto isfinite_pyapi = + [&](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + isfinite_output_typeid_vector, + isfinite_contig_dispatch_vector, + isfinite_strided_dispatch_vector); + }; + auto isfinite_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, + isfinite_output_typeid_vector); + }; + m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_isfinite_result_type", isfinite_result_type_pyapi, ""); + } + + // U18: ==== ISINF (x) + { + impl::populate_isinf_dispatch_vectors(); + + using impl::isinf_contig_dispatch_vector; + using impl::isinf_output_typeid_vector; + using impl::isinf_strided_dispatch_vector; + auto isinf_pyapi = [&](dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isinf_output_typeid_vector, + isinf_contig_dispatch_vector, isinf_strided_dispatch_vector); + }; + auto isinf_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, + isinf_output_typeid_vector); + }; + m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_isinf_result_type", isinf_result_type_pyapi, ""); + } + + // U19: ==== ISNAN (x) + { + impl::populate_isnan_dispatch_vectors(); + + using impl::isnan_contig_dispatch_vector; + using impl::isnan_output_typeid_vector; + using impl::isnan_strided_dispatch_vector; + auto isnan_pyapi = [&](dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isnan_output_typeid_vector, + isnan_contig_dispatch_vector, isnan_strided_dispatch_vector); + }; + auto isnan_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, + isnan_output_typeid_vector); + }; + m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_isnan_result_type", isnan_result_type_pyapi, ""); 
+ } + + // B13: ==== LESS (x1, x2) + // FIXME: + + // B14: ==== LESS_EQUAL (x1, x2) + // FIXME: + + // U20: ==== LOG (x) + // FIXME: + + // U21: ==== LOG1P (x) + // FIXME: + + // U22: ==== LOG2 (x) + // FIXME: + + // U23: ==== LOG10 (x) + // FIXME: + + // B15: ==== LOGADDEXP (x1, x2) + // FIXME: + + // B16: ==== LOGICAL_AND (x1, x2) + // FIXME: + + // U24: ==== LOGICAL_NOT (x) + // FIXME: + + // B17: ==== LOGICAL_OR (x1, x2) + // FIXME: + + // B18: ==== LOGICAL_XOR (x1, x2) + // FIXME: + + // B19: ==== MULTIPLY (x1, x2) + // FIXME: + + // U25: ==== NEGATIVE (x) + // FIXME: + + // B20: ==== NOT_EQUAL (x1, x2) + // FIXME: + + // U26: ==== POSITIVE (x) + // FIXME: + + // B21: ==== POW (x1, x2) + // FIXME: + + // U27: ==== REAL (x) + // FIXME: + + // B22: ==== REMAINDER (x1, x2) + // FIXME: + + // U28: ==== ROUND (x) + // FIXME: + + // U29: ==== SIGN (x) + // FIXME: + + // U30: ==== SIN (x) + // FIXME: + + // U31: ==== SINH (x) + // FIXME: + + // U32: ==== SQUARE (x) + // FIXME: + + // U33: ==== SQRT (x) + { + impl::populate_sqrt_dispatch_vectors(); + using impl::sqrt_contig_dispatch_vector; + using impl::sqrt_output_typeid_vector; + using impl::sqrt_strided_dispatch_vector; + + auto sqrt_pyapi = [&](arrayT src, arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sqrt_output_typeid_vector, + sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector); + }; + m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sqrt_result_type_pyapi = [&](py::dtype dtype) { + return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector); + }; + m.def("_sqrt_result_type", sqrt_result_type_pyapi); + } + + // B23: ==== SUBTRACT (x1, x2) + // FIXME: + + // U34: ==== TAN (x) + // FIXME: + + // U35: ==== TANH (x) + // FIXME: + + // U36: ==== TRUNC (x) + // FIXME: +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions.hpp new file mode 100644 index 0000000000..58127756ca --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions.hpp @@ -0,0 +1,594 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +extern py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t); +extern int _result_typeid(int arg_typeid, const int *fn_output_id); + +template +std::pair +py_unary_ufunc(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + sycl::queue q, + const std::vector &depends, + // + const output_typesT &output_type_vec, + const contig_dispatchT &contig_dispatch_vector, + const strided_dispatchT &strided_dispatch_vector) +{ + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int func_output_typeid = output_type_vec[src_typeid]; + + // check that types are supported + if (dst_typeid != func_output_typeid) { + throw py::value_error( + "Destination array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that dimensions are the same + int src_nd = src.get_ndim(); + if (src_nd != dst.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + size_t src_nelems(1); + + for (int i = 0; i < src_nd; ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that output is ample enough to accomodate all elements + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // check memory overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // handle contiguous inputs + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + + if (both_c_contig || both_f_contig) { + auto contig_fn = contig_dispatch_vector[src_typeid]; + + if (contig_fn == nullptr) { + throw std::runtime_error( + "Contiguous implementation is missing for 
src_typeid=" + + std::to_string(src_typeid)); + } + + auto comp_ev = contig_fn(q, src_nelems, src_data, dst_data, depends); + sycl::event ht_ev = + dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + // simplify iteration space + // if 1d with strides 1 - input is contig + // dispatch to strided + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + // Special case of contiguous data + auto contig_fn = contig_dispatch_vector[src_typeid]; + + if (contig_fn == nullptr) { + throw std::runtime_error( + "Contiguous implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + auto comp_ev = + contig_fn(q, src_nelems, src_data + src_elem_size * src_offset, + dst_data + dst_elem_size * dst_offset, depends); + + sycl::event ht_ev = + dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + // Strided implementation + auto strided_fn = strided_dispatch_vector[src_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Strided implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + std::vector host_tasks{}; + host_tasks.reserve(2); + + const auto &ptr_size_event_triple_ = device_allocate_and_pack( + q, host_tasks, simplified_shape, simplified_src_strides, + simplified_dst_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_triple_); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_triple_); + + if (shape_strides == nullptr) { + throw std::runtime_error("Device memory allocation failed"); + } + + sycl::event strided_fn_ev = + strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset, + dst_data, dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + auto ctx = q.get_context(); + sycl::event tmp_cleanup_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(strided_fn_ev); + cgh.host_task( + [ctx, shape_strides]() { sycl::free(shape_strides, ctx); }); + }); + host_tasks.push_back(tmp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(q, {src, dst}, host_tasks), + strided_fn_ev); +} + +template +py::object py_unary_ufunc_result_type(py::dtype input_dtype, + const output_typesT &output_types) +{ + int tn = input_dtype.num(); // NumPy type numbers are the same as in dpctl + int src_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src_typeid = array_types.typenum_to_lookup_id(tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + int dst_typeid = _result_typeid(src_typeid, output_types); + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + auto dst_typenum_t = static_cast(dst_typeid); + + auto dt = _dtype_from_typenum(dst_typenum_t); + + 
return py::cast(dt); + } +} + +// ======================== Binary functions =========================== + +namespace +{ +template +bool isEqual(Container const &c, std::initializer_list const &l) +{ + return std::equal(std::begin(c), std::end(c), std::begin(l), std::end(l)); +} +} // namespace + +template +std::pair py_binary_ufunc( + dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, + dpctl::tensor::usm_ndarray dst, // dst = op(src1, src2), elementwise + sycl::queue exec_q, + const std::vector depends, + // + const output_typesT &output_type_table, + const contig_dispatchT &contig_dispatch_table, + const strided_dispatchT &strided_dispatch_table, + const contig_matrix_row_dispatchT + &contig_matrix_row_broadcast_dispatch_table, + const contig_row_matrix_dispatchT + &contig_row_matrix_broadcast_dispatch_table) +{ + // check type_nums + int src1_typenum = src1.get_typenum(); + int src2_typenum = src2.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum); + int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int output_typeid = output_type_table[src1_typeid][src2_typeid]; + + if (output_typeid != dst_typeid) { + throw py::value_error( + "Destination array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check shapes, broadcasting is assumed done by caller + // check that dimensions are the same + int dst_nd = dst.get_ndim(); + if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *src1_shape = src1.get_shape_raw(); + const py::ssize_t *src2_shape = src2.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + size_t src_nelems(1); + + for (int i = 0; i < dst_nd; ++i) { + src_nelems *= static_cast(src1_shape[i]); + shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] && + src2_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that output is ample enough to accomodate all elements + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accomodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accomodate all the " + "elements of source array."); + } + } + + // check memory overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src1, dst) || overlap(src2, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + // check memory overlap + const char *src1_data = src1.get_data(); + const char *src2_data = src2.get_data(); + char *dst_data = dst.get_data(); + + // handle contiguous inputs + bool is_src1_c_contig = src1.is_c_contiguous(); + bool is_src1_f_contig = src1.is_f_contiguous(); + + bool is_src2_c_contig = src2.is_c_contiguous(); + bool is_src2_f_contig = src2.is_f_contiguous(); + 
+ bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = + (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig); + bool all_f_contig = + (is_src1_f_contig && is_src2_f_contig && is_dst_f_contig); + + // dispatch for contiguous inputs + if (all_c_contig || all_f_contig) { + auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, 0, + src2_data, 0, dst_data, 0, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + + // simplify strides + auto const &src1_strides = src1.get_strides_vector(); + auto const &src2_strides = src2.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src1_strides; + shT simplified_src2_strides; + shT simplified_dst_strides; + py::ssize_t src1_offset(0); + py::ssize_t src2_offset(0); + py::ssize_t dst_offset(0); + + int nd = dst_nd; + const py::ssize_t *shape = src1_shape; + + // all args except itemsizes and is_?_contig bools can be modified by + // reference + dpctl::tensor::py_internal::simplify_iteration_space_3( + nd, shape, src1_strides, src2_strides, dst_strides, + // outputs + simplified_shape, simplified_src1_strides, simplified_src2_strides, + simplified_dst_strides, src1_offset, src2_offset, dst_offset); + + std::vector host_tasks{}; + + if (nd < 3) { + static constexpr auto unit_stride = + std::initializer_list{1}; + + if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) && + isEqual(simplified_src2_strides, unit_stride) && + isEqual(simplified_dst_strides, unit_stride)) + { + auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, + src1_offset, src2_data, src2_offset, + dst_data, dst_offset, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + if (nd == 2) { + static constexpr auto zero_one_strides = + std::initializer_list{0, 1}; + static constexpr auto one_zero_strides = + std::initializer_list{1, 0}; + constexpr py::ssize_t one{1}; + // special case of C-contiguous matrix and a row + if (isEqual(simplified_src2_strides, zero_one_strides) && + isEqual(simplified_src1_strides, {simplified_shape[1], one}) && + isEqual(simplified_dst_strides, {simplified_shape[1], one})) + { + auto matrix_row_broadcast_fn = + contig_matrix_row_broadcast_dispatch_table[src1_typeid] + [src2_typeid]; + if (matrix_row_broadcast_fn != nullptr) { + size_t n0 = simplified_shape[0]; + size_t n1 = simplified_shape[1]; + sycl::event comp_ev = matrix_row_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, + host_tasks), + comp_ev); + } + } + if (isEqual(simplified_src1_strides, one_zero_strides) && + isEqual(simplified_src2_strides, {one, simplified_shape[0]}) && + isEqual(simplified_dst_strides, {one, simplified_shape[0]})) + { + auto row_matrix_broadcast_fn = + contig_row_matrix_broadcast_dispatch_table[src1_typeid] + [src2_typeid]; + if (row_matrix_broadcast_fn != nullptr) { + size_t n0 = simplified_shape[1]; + size_t n1 = simplified_shape[0]; + 
sycl::event comp_ev = row_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, + host_tasks), + comp_ev); + } + } + } + } + + // dispatch to strided code + auto strided_fn = strided_dispatch_table[src1_typeid][src2_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Contiguous implementation is missing for src1_typeid=" + + std::to_string(src1_typeid) + + " and src2_typeid=" + std::to_string(src2_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_sz_event_triple_ = device_allocate_and_pack( + exec_q, host_tasks, simplified_shape, simplified_src1_strides, + simplified_src2_strides, simplified_dst_strides); + + py::ssize_t *shape_strides = std::get<0>(ptr_sz_event_triple_); + sycl::event copy_shape_ev = std::get<2>(ptr_sz_event_triple_); + + if (shape_strides == nullptr) { + throw std::runtime_error("Unabled to allocate device memory"); + } + + sycl::event strided_fn_ev = strided_fn( + exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + auto ctx = exec_q.get_context(); + + sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(strided_fn_ev); + cgh.host_task( + [ctx, shape_strides]() { sycl::free(shape_strides, ctx); }); + }); + + host_tasks.push_back(tmp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, host_tasks), + strided_fn_ev); +} + +template +py::object py_binary_ufunc_result_type(py::dtype input1_dtype, + py::dtype input2_dtype, + const output_typesT &output_types_table) +{ + int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl + int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl + int src1_typeid = -1; + int src2_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src1_typeid = array_types.typenum_to_lookup_id(tn1); + src2_typeid = array_types.typenum_to_lookup_id(tn2); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 || + src2_typeid >= td_ns::num_types) + { + throw std::runtime_error("binary output type lookup failed"); + } + int dst_typeid = output_types_table[src1_typeid][src2_typeid]; + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + auto dst_typenum_t = static_cast(dst_typeid); + + auto dt = _dtype_from_typenum(dst_typenum_t); + + return py::cast(dt); + } +} + +extern void init_elementwise_functions(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/eye_ctor.cpp b/dpctl/tensor/libtensor/source/eye_ctor.cpp index f04518bc48..c4a8f0cd08 100644 --- a/dpctl/tensor/libtensor/source/eye_ctor.cpp +++ b/dpctl/tensor/libtensor/source/eye_ctor.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
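Both py_unary_ufunc and py_binary_ufunc above finish their strided path the same way: the packed shape/strides buffer produced by device_allocate_and_pack is released by a host_task that depends on the kernel's event, so the caller gets its event pair back without any blocking wait. A minimal sketch of that asynchronous-cleanup pattern with hypothetical names (scaled_copy, scratch) and a plain USM scratch buffer standing in for the packed metadata:

```cpp
// Sketch: release a device temporary without blocking the caller.  A
// host_task that depends on the kernel's event frees the allocation once
// the kernel has finished; names and the scratch buffer are hypothetical.
#include <sycl/sycl.hpp>

#include <cstddef>
#include <utility>

std::pair<sycl::event, sycl::event>
scaled_copy(sycl::queue q, const float *src, float *dst, std::size_t n)
{
    // Device scratch playing the role of device_allocate_and_pack's output.
    float *scratch = sycl::malloc_device<float>(n, q);

    sycl::event fill_ev = q.fill(scratch, 2.0f, n);

    sycl::event comp_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(fill_ev);
        cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
            const std::size_t k = i[0];
            dst[k] = src[k] * scratch[k];
        });
    });

    // Asynchronous cleanup: runs on the host only after comp_ev completes,
    // so no q.wait() is needed in this function.
    sycl::context ctx = q.get_context();
    sycl::event cleanup_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(comp_ev);
        cgh.host_task([ctx, scratch]() { sycl::free(scratch, ctx); });
    });

    // Mirrors the (host-task event, computation event) pair returned by the
    // py_*_ufunc helpers above.
    return {cleanup_ev, comp_ev};
}
```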
diff --git a/dpctl/tensor/libtensor/source/eye_ctor.hpp b/dpctl/tensor/libtensor/source/eye_ctor.hpp index 1067ed8d8b..3436c23bd8 100644 --- a/dpctl/tensor/libtensor/source/eye_ctor.hpp +++ b/dpctl/tensor/libtensor/source/eye_ctor.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp index f4b8ae5f42..2f7182807c 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/full_ctor.hpp b/dpctl/tensor/libtensor/source/full_ctor.hpp index 4a620a03db..3870573fa4 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl/tensor/libtensor/source/full_ctor.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 1039820014..80b12314e3 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp index c6d5ed74b8..438bf613e2 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/linear_sequences.cpp b/dpctl/tensor/libtensor/source/linear_sequences.cpp index 9b17581b8e..306add5f54 100644 --- a/dpctl/tensor/libtensor/source/linear_sequences.cpp +++ b/dpctl/tensor/libtensor/source/linear_sequences.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/dpctl/tensor/libtensor/source/linear_sequences.hpp b/dpctl/tensor/libtensor/source/linear_sequences.hpp index b463fdf533..8da56ecd10 100644 --- a/dpctl/tensor/libtensor/source/linear_sequences.hpp +++ b/dpctl/tensor/libtensor/source/linear_sequences.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index 4eecef2d3f..2fb2d6078e 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 356afca08d..1bd8ff5aa0 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index e4c8a5e9b1..3a72c205fb 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -39,6 +39,7 @@ #include "copy_for_reshape.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" +#include "elementwise_functions.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" @@ -348,5 +349,6 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); } diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp index b9cf4543f9..b40b50d030 100644 --- a/dpctl/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
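With dpctl::tensor::py_internal::init_elementwise_functions(m) now called from tensor_py.cpp, the new element-wise entry points should be reachable from the dpctl.tensor namespace once the extension is rebuilt. A small smoke check along these lines (the function list simply mirrors the tests added in this patch):

    import dpctl.tensor as dpt

    for name in ("abs", "add", "cos", "divide", "equal",
                 "isfinite", "isinf", "isnan", "sqrt"):
        assert hasattr(dpt, name), f"dpctl.tensor.{name} is missing"

    # one unary and one binary call as a quick end-to-end exercise
    x = dpt.linspace(0, 1, num=16, dtype="f4")
    y = dpt.cos(x)
    z = dpt.add(y, y)
    print(z.dtype, z.shape)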
diff --git a/dpctl/tensor/libtensor/source/triul_ctor.hpp b/dpctl/tensor/libtensor/source/triul_ctor.hpp index 3789df80c5..2f277bb416 100644 --- a/dpctl/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl/tensor/libtensor/source/triul_ctor.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/dpctl/tests/elementwise/__init__.py b/dpctl/tests/elementwise/__init__.py new file mode 100644 index 0000000000..ac810ba127 --- /dev/null +++ b/dpctl/tests/elementwise/__init__.py @@ -0,0 +1,20 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Collection of test and utility files for testing elementwise operations +over :class:`dpctl.tensor.usm_ndarray`. +""" diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py new file mode 100644 index 0000000000..ee7fa0cb6c --- /dev/null +++ b/dpctl/tests/elementwise/test_abs.py @@ -0,0 +1,115 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_abs_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + if np.issubdtype(arg_dt, np.complexfloating): + type_map = { + np.dtype("c8"): np.dtype("f4"), + np.dtype("c16"): np.dtype("f8"), + } + assert dpt.abs(X).dtype == type_map[arg_dt] + + r = dpt.empty_like(X, dtype=type_map[arg_dt]) + dpt.abs(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X))) + else: + assert dpt.abs(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.abs(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_abs_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.abs(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = dpt.asnumpy(X) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_abs_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.abs(U, order=ord) + expected_Y = np.ones(Y.shape, dtype=Y.dtype) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_abs_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + Xnp = np.random.standard_normal( + size=input_shape + ) + 1j * np.random.standard_normal(size=input_shape) + Xnp = Xnp.astype(arg_dt) + X[...] = Xnp + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.abs(U, order=ord) + expected_Y = np.abs(np.transpose(Xnp[:, ::-1, ::-1, :], perms)) + tol = dpt.finfo(Y.dtype).resolution + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py new file mode 100644 index 0000000000..fa97b1c1c7 --- /dev/null +++ b/dpctl/tests/elementwise/test_add.py @@ -0,0 +1,306 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes + +import numpy as np +import pytest +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported +from dpctl.utils import ExecutionPlacementError + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_add_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.add(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.add( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.add(ar1, ar2, out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.add(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.add( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all() + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.add(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_add_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.add(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_add_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.add(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.add(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.add(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.add(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.add(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.add(ar1, ar2, order="F") + assert 
r2.flags.f_contiguous + r3 = dpt.add(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.add(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.add(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.add(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.add(ar1, ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.add(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_add_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.add(m, v) + assert (dpt.asnumpy(r) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r2 = dpt.add(v, m) + assert (dpt.asnumpy(r2) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r3 = dpt.empty_like(m) + dpt.add(m, v, out=r3) + assert (dpt.asnumpy(r3) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r4 = dpt.empty_like(m) + dpt.add(v, m, out=r4) + assert (dpt.asnumpy(r4) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + +def test_add_broadcasting_error(): + get_queue_or_skip() + m = dpt.ones((10, 10), dtype="i4") + v = dpt.ones((3,), dtype="i4") + with pytest.raises(ValueError): + dpt.add(m, v) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_add_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.add(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.add(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_add_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.add(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_add_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.add(a, c) + + +def test_add_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + y = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + TypeError, + "Input and output allocation queues are not compatible", + dpt.add, + ar1, + ar2, + y, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = dpt.empty(3) + assert_raises_regex( + TypeError, + "The shape of input and output arrays are inconsistent", + dpt.add, + ar1, + ar2, + y, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = ar1 + assert_raises_regex( + TypeError, + "Input and output arrays have memory overlap", + dpt.add, + ar1, 
+ ar2, + y, + ) + + ar1 = np.ones(2, dtype="float32") + ar2 = np.ones_like(ar1, dtype="int32") + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred.*", + dpt.add, + ar1, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = np.empty_like(ar1) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.add, + ar1, + ar2, + y, + ) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_add_dtype_error( + dtype, +): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + ar1 = dpt.ones(5, dtype=dtype) + ar2 = dpt.ones_like(ar1, dtype="f4") + + y = dpt.zeros_like(ar1, dtype="int8") + assert_raises_regex( + TypeError, "Output array of type.*is needed", dpt.add, ar1, ar2, y + ) diff --git a/dpctl/tests/elementwise/test_cos.py b/dpctl/tests/elementwise/test_cos.py new file mode 100644 index 0000000000..3bf441a8dc --- /dev/null +++ b/dpctl/tests/elementwise/test_cos.py @@ -0,0 +1,175 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _map_to_device_dtype + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_cos_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.cos(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.cos(X).dtype == expected_dtype + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.cos(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + Y = dpt.empty_like(X, dtype=expected_dtype) + dpt.cos(X, out=Y) + np.testing.assert_allclose(dpt.asnumpy(dpt.cos(X)), dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_cos_output(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + n_rep = 137 + + Xnp = np.linspace(-np.pi / 4, np.pi / 4, num=n_seq, dtype=dtype) + X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q) + + Y = dpt.cos(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose( + dpt.asnumpy(Y), np.repeat(np.cos(Xnp), n_rep), atol=tol, rtol=tol + ) + + Z = dpt.empty_like(X, dtype=dtype) + dpt.cos(X, out=Z) + + np.testing.assert_allclose( + dpt.asnumpy(Z), np.repeat(np.cos(Xnp), n_rep), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_cos_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, 
usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = np.pi / 6 + X[..., 1::2] = np.pi / 3 + + Y = dpt.cos(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.cos(np.float32(np.pi / 6)) + expected_Y[..., 1::2] = np.cos(np.float32(np.pi / 3)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_cos_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = np.pi / 6 + X[..., 1::2] = np.pi / 3 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.cos(U, order=ord) + expected_Y = np.cos(dpt.asnumpy(U)) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_cos_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + x = dpt.zeros(2, sycl_queue=gpu_queue) + y = dpt.empty_like(x, sycl_queue=cpu_queue) + assert_raises_regex( + TypeError, + "Input and output allocation queues are not compatible", + dpt.cos, + x, + y, + ) + + x = dpt.zeros(2) + y = dpt.empty(3) + assert_raises_regex( + TypeError, + "The shape of input and output arrays are inconsistent", + dpt.cos, + x, + y, + ) + + x = dpt.zeros(2) + y = x + assert_raises_regex( + TypeError, "Input and output arrays have memory overlap", dpt.cos, x, y + ) + + x = dpt.zeros(2, dtype="float32") + y = np.empty_like(x) + assert_raises_regex( + TypeError, "output array must be of usm_ndarray type", dpt.cos, x, y + ) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_cos_error_dtype(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.zeros(5, dtype=dtype) + y = dpt.empty_like(x, dtype="int16") + assert_raises_regex( + TypeError, "Output array of type.*is needed", dpt.cos, x, y + ) diff --git a/dpctl/tests/elementwise/test_divide.py b/dpctl/tests/elementwise/test_divide.py new file mode 100644 index 0000000000..41aac736d7 --- /dev/null +++ b/dpctl/tests/elementwise/test_divide.py @@ -0,0 +1,189 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
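+
+# These tests check that dpt.divide matches numpy.divide type promotion
+# across the dtype matrix, coerces the result's USM type via
+# dpctl.utils.get_coerced_usm_type, honors the order= keyword, broadcasts
+# operands, and accepts Python scalars as well as objects exposing
+# __sycl_usm_array_interface__.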
+ +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_divide_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.divide(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_divide_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_divide_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.divide(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.divide(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_divide_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.divide(m, v) + + expected = np.divide( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.divide(v, m) + expected2 = np.divide( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) 
== expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_divide_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.divide(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.divide(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_divide_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.divide(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_divide_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.divide(a, c) diff --git a/dpctl/tests/elementwise/test_equal.py b/dpctl/tests/elementwise/test_equal.py new file mode 100644 index 0000000000..cdd26a32d0 --- /dev/null +++ b/dpctl/tests/elementwise/test_equal.py @@ -0,0 +1,186 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
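+
+# dpt.equal is expected to produce a boolean usm_ndarray; the tests below
+# compare it with numpy.equal over the dtype matrix and additionally cover
+# USM-type coercion, the order= keyword, broadcasting, Python scalar
+# operands, and objects exposing __sycl_usm_array_interface__.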
+ +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _compare_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.equal( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.equal( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.equal(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.equal(m, v) + expected = np.full((100, 5), [False, True, False, False, False], dtype="?") + + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.equal(v, m) + assert (dpt.asnumpy(r2) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def 
test_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + assert dpt.all(R) + R = dpt.equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + assert dpt.all(R) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.equal(a, c) diff --git a/dpctl/tests/elementwise/test_isfinite.py b/dpctl/tests/elementwise/test_isfinite.py new file mode 100644 index 0000000000..f25005542d --- /dev/null +++ b/dpctl/tests/elementwise/test_isfinite.py @@ -0,0 +1,98 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
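+
+# dpt.isfinite is expected to return a boolean usm_ndarray; the tests below
+# compare it with numpy.isfinite for real and complex inputs containing nan
+# and inf, over several element counts and memory orders.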
+ +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isfinite_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + assert dpt.isfinite(X).dtype == dpt.bool + + +def test_isfinite_output(): + q = get_queue_or_skip() + + Xnp = np.asarray(np.nan) + X = dpt.asarray(np.nan, sycl_queue=q) + assert dpt.asnumpy(dpt.isfinite(X)) == np.isfinite(Xnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_isfinite_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = complex(np.nan, np.nan) + y2 = complex(1, np.nan) + y3 = complex(np.nan, 1) + y4 = complex(2, 1) + y5 = complex(np.inf, 1) + + Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 12) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isfinite(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_isfinite_floats(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = np.nan + y2 = 1 + y3 = np.inf + + for mult in [123, 137, 255, 271, 272]: + Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isfinite(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isfinite_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q) + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms) + Y = dpt.isfinite(U, order=ord) + expected_Y = np.full(Y.shape, True, dtype=Y.dtype) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpctl/tests/elementwise/test_isinf.py b/dpctl/tests/elementwise/test_isinf.py new file mode 100644 index 0000000000..37f9a68a2b --- /dev/null +++ b/dpctl/tests/elementwise/test_isinf.py @@ -0,0 +1,92 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isinf_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + assert dpt.isinf(X).dtype == dpt.bool + + +def test_isinf_output(): + q = get_queue_or_skip() + + Xnp = np.asarray(np.inf) + X = dpt.asarray(np.inf, sycl_queue=q) + assert dpt.asnumpy(dpt.isinf(X)) == np.isinf(Xnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_isinf_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = complex(np.inf, np.inf) + y2 = complex(1, np.inf) + y3 = complex(np.inf, 1) + y4 = complex(2, 1) + y5 = complex(np.inf, 1) + y6 = complex(np.inf, np.nan) + + Ynp = np.repeat(np.array([y1, y2, y3, y4, y5, y6], dtype=dtype), 123) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_isinf_floats(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = np.nan + y2 = 1 + y3 = np.inf + y4 = -np.inf + + for mult in [123, 137, 255, 271, 272]: + Ynp = np.repeat(np.array([y1, y2, y3, y4], dtype=dtype), mult) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isinf_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q) + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms) + Y = dpt.isinf(U, order=ord) + expected_Y = np.full(Y.shape, False, dtype=Y.dtype) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpctl/tests/elementwise/test_isnan.py b/dpctl/tests/elementwise/test_isnan.py new file mode 100644 index 0000000000..5a2ac4e582 --- /dev/null +++ b/dpctl/tests/elementwise/test_isnan.py @@ -0,0 +1,98 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isnan_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + assert dpt.isnan(X).dtype == dpt.bool + + +def test_isnan_output(): + q = get_queue_or_skip() + + Xnp = np.asarray(np.nan) + X = dpt.asarray(np.nan, sycl_queue=q) + assert dpt.asnumpy(dpt.isnan(X)) == np.isnan(Xnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_isnan_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = complex(np.nan, np.nan) + y2 = complex(1, np.nan) + y3 = complex(np.nan, 1) + y4 = complex(2, 1) + y5 = complex(np.inf, 1) + + Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 123) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isnan(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_isnan_floats(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = np.nan + y2 = 1 + y3 = np.inf + + for mult in [123, 137, 255, 271, 272]: + Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isnan(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isnan_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q) + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms) + Y = dpt.isnan(U, order=ord) + expected_Y = np.full(Y.shape, False, dtype=Y.dtype) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpctl/tests/elementwise/test_sqrt.py b/dpctl/tests/elementwise/test_sqrt.py new file mode 100644 index 0000000000..ce168a5ccb --- /dev/null +++ b/dpctl/tests/elementwise/test_sqrt.py @@ -0,0 +1,128 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
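+
+# dpt.sqrt is compared against numpy.sqrt within a tolerance derived from
+# dpt.finfo for contiguous and strided inputs; the tests below also cover
+# USM-type and queue propagation, the order= keyword, and IEEE special
+# values (nan, -1.0, signed zeros, infinities).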
+ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_equal + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _map_to_device_dtype, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_sqrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.sqrt(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.sqrt(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_sqrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.sqrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_sqrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.sqrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_sqrt_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 16.0 + X[..., 1::2] = 23.0 + + Y = dpt.sqrt(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.sqrt(np.float32(16.0)) + expected_Y[..., 1::2] = np.sqrt(np.float32(23.0)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_sqrt_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 16.0 + X[..., 1::2] = 23.0 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.sqrt(U, order=ord) + expected_Y = np.sqrt(dpt.asnumpy(U)) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_sqrt_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q + ) + Xnp = dpt.asnumpy(X) + + assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp)) diff --git a/dpctl/tests/elementwise/test_type_utils.py b/dpctl/tests/elementwise/test_type_utils.py new file mode 100644 index 0000000000..c040713925 --- /dev/null +++ b/dpctl/tests/elementwise/test_type_utils.py @@ -0,0 +1,183 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._type_utils as tu + +from .utils import _all_dtypes, _map_to_device_dtype + + +class MockDevice: + def __init__(self, fp16: bool, fp64: bool): + self.has_aspect_fp16 = fp16 + self.has_aspect_fp64 = fp64 + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_type_utils_map_to_device_type(dtype): + for fp64 in [ + True, + False, + ]: + for fp16 in [True, False]: + dev = MockDevice(fp16, fp64) + dt_in = dpt.dtype(dtype) + dt_out = _map_to_device_dtype(dt_in, dev) + assert isinstance(dt_out, dpt.dtype) + + +def test_type_util_all_data_types(): + for fp64 in [ + True, + False, + ]: + for fp16 in [True, False]: + r = tu._all_data_types(fp16, fp64) + assert isinstance(r, list) + # 11: bool + 4 signed + 4 unsigned inegral + float32 + complex64 + assert len(r) == 11 + int(fp16) + 2 * int(fp64) + + +def test_type_util_can_cast(): + for fp64 in [ + True, + False, + ]: + for fp16 in [True, False]: + for from_ in _all_dtypes: + for to_ in _all_dtypes: + r = tu._can_cast( + dpt.dtype(from_), dpt.dtype(to_), fp16, fp64 + ) + assert isinstance(r, bool) + + +def test_type_utils_empty_like_orderK(): + try: + a = dpt.empty((10, 10), dtype=dpt.int32, order="F") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X = tu._empty_like_orderK(a, dpt.int32, a.usm_type, a.device) + assert X.flags["F"] + + +def test_type_utils_empty_like_orderK_invalid_args(): + with pytest.raises(TypeError): + tu._empty_like_orderK([1, 2, 3], dpt.int32, "device", None) + with pytest.raises(TypeError): + tu._empty_like_pair_orderK( + [1, 2, 3], + ( + 1, + 2, + 3, + ), + dpt.int32, + "device", + None, + ) + try: + a = dpt.empty(10, dtype=dpt.int32) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(TypeError): + tu._empty_like_pair_orderK( + a, + ( + 1, + 2, + 3, + ), + dpt.int32, + "device", + None, + ) + + +def test_type_utils_find_buf_dtype(): + def _denier_fn(dt): + return False + + for fp64 in [ + True, + False, + ]: + for fp16 in [True, False]: + dev = MockDevice(fp16, fp64) + arg_dt = dpt.float64 + r = tu._find_buf_dtype(arg_dt, _denier_fn, dev) + assert r == ( + None, + None, + ) + + +def test_type_utils_get_device_default_type(): + with pytest.raises(RuntimeError): + tu._get_device_default_dtype("-", MockDevice(True, True)) + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for k in ["b", "i", "u", "f", "c"]: + dt = tu._get_device_default_dtype(k, dev) + assert isinstance(dt, dpt.dtype) + assert dt.kind == k + + +def test_type_utils_find_buf_dtype2(): + def _denier_fn(dt1, dt2): + return False + + for fp64 in [ + True, + False, + ]: + for fp16 in [True, False]: + dev = MockDevice(fp16, fp64) + arg1_dt = dpt.float64 + arg2_dt = dpt.complex64 + r = tu._find_buf_dtype2(arg1_dt, arg2_dt, _denier_fn, dev) + assert r == ( + None, + None, + None, + ) + + +def test_unary_func_arg_validation(): + with pytest.raises(TypeError): + dpt.abs([1, 2, 3]) + try: + a = 
dpt.arange(8) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + dpt.abs(a, order="invalid") + + +def test_binary_func_arg_validation(): + with pytest.raises(dpctl.utils.ExecutionPlacementError): + dpt.add([1, 2, 3], 1) + try: + a = dpt.arange(8) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.add(a, Ellipsis) + dpt.add(a, a, order="invalid") diff --git a/dpctl/tests/elementwise/utils.py b/dpctl/tests/elementwise/utils.py new file mode 100644 index 0000000000..0d9396dcb4 --- /dev/null +++ b/dpctl/tests/elementwise/utils.py @@ -0,0 +1,55 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dpctl +import dpctl.tensor._type_utils as tu + +_all_dtypes = [ + "b1", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] +_usm_types = ["device", "shared", "host"] + + +def _map_to_device_dtype(dt, dev): + return tu._to_device_supported_dtype(dt, dev) + + +def _compare_dtypes(dt, ref_dt, sycl_queue=None): + assert isinstance(sycl_queue, dpctl.SyclQueue) + dev = sycl_queue.sycl_device + expected_dt = _map_to_device_dtype(ref_dt, dev) + return dt == expected_dt + + +__all__ = [ + "_all_dtypes", + "_usm_types", + "_map_to_device_dtype", + "_compare_dtypes", +]
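The helpers collected in utils.py above form the shared vocabulary of these test modules. A short sketch of how a new element-wise test would typically lean on them, with dpt.add standing in for the function under test (the test name and shapes are illustrative):

    import numpy as np
    import pytest

    import dpctl.tensor as dpt
    from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported

    from .utils import _all_dtypes, _compare_dtypes, _map_to_device_dtype


    @pytest.mark.parametrize("dtype", _all_dtypes)
    def test_sketch_result_dtype(dtype):
        q = get_queue_or_skip()
        skip_if_dtype_not_supported(dtype, q)

        x = dpt.zeros(16, dtype=dtype, sycl_queue=q)
        r = dpt.add(x, x)

        # reference dtype from NumPy, folded to what the device supports
        # (fp16/fp64 may be unavailable); _compare_dtypes does the folding
        # internally, _map_to_device_dtype exposes it directly
        ref = np.add(np.zeros(1, dtype=dtype), np.zeros(1, dtype=dtype)).dtype
        assert _compare_dtypes(r.dtype, ref, sycl_queue=q)
        assert r.dtype == _map_to_device_dtype(ref, q.sycl_device)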