From 4e144b161e62f3723a4a5e3ca9c0a07d3b235dd8 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Mon, 20 Dec 2021 10:56:18 -0500 Subject: [PATCH 1/7] [SYCL][Matrix] test the two features: fill a matrix and element wise operations Signed-off-by: Dounia Khaldi --- SYCL/Matrix/element_wise_ops.cpp | 173 +++++++++++++++++++++++++++ SYCL/Matrix/joint_matrix_ss_int8.cpp | 9 +- 2 files changed, 176 insertions(+), 6 deletions(-) create mode 100644 SYCL/Matrix/element_wise_ops.cpp diff --git a/SYCL/Matrix/element_wise_ops.cpp b/SYCL/Matrix/element_wise_ops.cpp new file mode 100644 index 0000000000..40a0d4377b --- /dev/null +++ b/SYCL/Matrix/element_wise_ops.cpp @@ -0,0 +1,173 @@ +//==----------- element_wise_ops.cpp - DPC++ joint_matrix------------- ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +​+#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 8 + +#define TM 8 +#define TN SG_SZ +#define TK 32 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_multiply(big_matrix &C, + big_matrix &A, + big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stride = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), 
range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + // For B, since current implementation does not support non-packed + // layout, users need to specify the updated VNNI sizes along with + // the packed_b layout. By default, the layout is row_major and size + // is (TK, TN). + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 + // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K, matrix_layout::row_major); + // Assuming B data is already in VNNI format. 
+ joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4, matrix_layout::packed_b); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + auto wi_slice_c = sub_c.get_wi_data(); + for (int i = 0; i < wi_slice_c.length(); i++) { + wi_slice_c[i] *= 2; + } + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +int8_t A[MATRIX_M][MATRIX_K]; +int8_t B[MATRIX_K / 4][MATRIX_N * 4]; +int32_t C[MATRIX_M][MATRIX_N]; +int32_t D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, + int N, int K) { + // tiling + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + char *va = (char *)(A_mem + m * K + k); + char *vb = (char *)(B_mem + k * N + n); + int acc = *(C_mem + m * N + n); + for (int i = 0; i < 4; i++) { + acc += (va[i] * vb[i]); + } + *(C_mem + m * N + n) = acc; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i + 2 * j; + } + } + for (int i = 0; i < MATRIX_K / 4; i++) { + for (int j = 0; j < MATRIX_N * 4; j++) { + B[i][j] = i + j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + big_matrix MB((int8_t *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 4); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + 
std::cout << "failed\n"; +} diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp index e80338e0e1..6551fb1326 100644 --- a/SYCL/Matrix/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp @@ -78,10 +78,7 @@ void matrix_multiply(big_matrix &C, joint_matrix sub_b(sg); joint_matrix sub_c(sg); - joint_matrix_load(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); + joint_matrix_fill(sg, sub_c, 0); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, @@ -139,8 +136,8 @@ int main() { } for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1; - D[i][j] = 1; + C[i][j] = 0; + D[i][j] = 0; } } From 2c6626745029f8073293aa3c7adf379e46716f31 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Tue, 21 Dec 2021 10:37:33 -0500 Subject: [PATCH 2/7] [SYCL][Matrix][Element-wise-ops] correct the reference multiplication Signed-off-by: Dounia Khaldi --- SYCL/Matrix/element_wise_ops.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/SYCL/Matrix/element_wise_ops.cpp b/SYCL/Matrix/element_wise_ops.cpp index 40a0d4377b..f3e1cfe8f0 100644 --- a/SYCL/Matrix/element_wise_ops.cpp +++ b/SYCL/Matrix/element_wise_ops.cpp @@ -95,10 +95,10 @@ void matrix_multiply(big_matrix &C, sg_starty / SG_SZ * TN * 4, N * 4, matrix_layout::packed_b); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - auto wi_slice_c = sub_c.get_wi_data(); - for (int i = 0; i < wi_slice_c.length(); i++) { - wi_slice_c[i] *= 2; - } + } + auto wi_slice_c = sub_c.get_wi_data(); + for (int i = 0; i < wi_slice_c.length(); i++) { + wi_slice_c[i] *= 2; } joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + @@ -130,6 +130,7 @@ void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, } *(C_mem + m * N + n) = acc; } + *(C_mem + m * N + n) *= 2; } } From 
51203d15f6e97c67f7ab015ddcc477a2f47628da Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Wed, 5 Jan 2022 21:15:45 -0500 Subject: [PATCH 3/7] [SYCL][Matrix] Incorporate Alexey comments by adding half type and all other operations Signed-off-by: Dounia Khaldi --- SYCL/Matrix/element_wise_all_ops_half.cpp | 255 ++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 SYCL/Matrix/element_wise_all_ops_half.cpp diff --git a/SYCL/Matrix/element_wise_all_ops_half.cpp b/SYCL/Matrix/element_wise_all_ops_half.cpp new file mode 100644 index 0000000000..ab4a6d932e --- /dev/null +++ b/SYCL/Matrix/element_wise_all_ops_half.cpp @@ -0,0 +1,255 @@ +//==----------- element_wise_all_ops_half.cpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define SG_SZ 8 + +#define TM 8 +#define TN SG_SZ +#define TK 16 + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void assert_ops_ref(/*const T &C*/ accessor + C, + const float ref) { + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) { + auto diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) < + std::numeric_limits::epsilon()); + } +} +template +void matrix_verify_add(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { 
+ auto accA = bufA.get_access(cgh); + + cgh.parallel_for(r, [accA](nd_item<2> spmd_item) { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + + joint_matrix_fill(sg, sub_a, 5.0); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] + 2; + } + joint_matrix_store(sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_access(), ref); +} + +template +void matrix_verify_sub(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + auto accA = bufA.get_access(cgh); + + cgh.parallel_for(r, [accA](nd_item<2> spmd_item) { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + + joint_matrix_fill(sg, sub_a, 5.0); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] - 2; + } + joint_matrix_store(sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_access(), ref); +} + +template +void matrix_verify_mul(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + auto accA = bufA.get_access(cgh); + + 
cgh.parallel_for(r, [accA](nd_item<2> spmd_item) { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + + joint_matrix_fill(sg, sub_a, 5.0); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] * 3.0; + } + joint_matrix_store(sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_access(), ref); +} + +template +void matrix_verify_div(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + auto accA = bufA.get_access(cgh); + + cgh.parallel_for(r, [accA](nd_item<2> spmd_item) { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + + joint_matrix_fill(sg, sub_a, 4.0); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + wi_slice_a[i] = wi_slice_a[i] / 2.0; + } + joint_matrix_store(sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_access(), ref); +} + +template +void matrix_verify_logic(queue q, big_matrix &A, nd_range<2> &r, + const float ref) { + buffer bufA(A.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + auto accA = bufA.get_access(cgh); + + cgh.parallel_for(r, [accA](nd_item<2> 
spmd_item) { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + + joint_matrix_fill(sg, sub_a, 5.0); + + auto wi_slice_a = sub_a.get_wi_data(); + for (int i = 0; i < wi_slice_a.length(); i++) { + if (wi_slice_a[i]) { + if (wi_slice_a[i] > 2.0 || wi_slice_a[i] >= 2.0 || + wi_slice_a[i] < 2.0 || wi_slice_a[i] <= 2.0) { + T val = (wi_slice_a[i] != 2.0) ? wi_slice_a[i] : 2.0; + val--; + val++; + if (wi_slice_a[i] == 2.0) { + val -= 2; + val *= 3.0; + val /= 2.0; + } else { + val += 2; + } + wi_slice_a[i] = val; + } + } + } + joint_matrix_store(sg, sub_a, + accA.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); + assert_ops_ref(bufA.get_access(), ref); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +half A[MATRIX_M][MATRIX_N]; +float D[MATRIX_M][MATRIX_N]; + +void matrix_ops_ref(float *D, int M, int N) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + *(D + m * N + n) = 0; + *(D + m * N + n) *= 2; + } +} + +int main() { + + big_matrix MD((float *)&D); + big_matrix MA((half *)&A); + + size_t NDRangeM = MATRIX_M / TM; + size_t NDRangeN = MATRIX_N / TN; + queue q; + nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}); + + matrix_verify_add(q, MA, r, 7.0); + matrix_verify_sub(q, MA, r, 3.0); + matrix_verify_mul(q, MA, r, 15.0); + matrix_verify_div(q, MA, r, 2.0); + matrix_verify_logic(q, MA, r, 7.0); + + return 0; +} From 7ca403f33092a5fec0d926316c831cc00d884bc2 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Thu, 6 Jan 2022 10:16:42 -0500 Subject: [PATCH 4/7] [SYCL][Matrix] Add xfail to the modified test because the backend patches are not merged yet 
Signed-off-by: Dounia Khaldi --- SYCL/Matrix/joint_matrix_ss_int8.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp index 6551fb1326..d542d5e13c 100644 --- a/SYCL/Matrix/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp @@ -11,6 +11,8 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// XFAIL: * + #include #include From 6d126d13da4ac9b5f6b0155f35b60ce97d58d2ef Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Thu, 6 Jan 2022 22:08:38 -0500 Subject: [PATCH 5/7] [SYCL][Matrix] Add a comment to explain why the XFAIL is temporary --- SYCL/Matrix/joint_matrix_ss_int8.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp index d542d5e13c..ba456c5a32 100644 --- a/SYCL/Matrix/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp @@ -11,6 +11,8 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +//Two patches are not merged yet causing this test to fail +//Will remove the XFAIL once these patches are merged // XFAIL: * #include From e5026c60d782024ec2071cc931cdbec8199d6626 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Thu, 6 Jan 2022 22:20:17 -0500 Subject: [PATCH 6/7] [SYCL][Matrix] formatting --- SYCL/Matrix/joint_matrix_ss_int8.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp index ba456c5a32..416ff0092d 100644 --- a/SYCL/Matrix/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp @@ -11,8 +11,8 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -//Two patches are not merged yet causing this test to fail -//Will remove the XFAIL once these patches are merged +// Two patches are not merged yet causing this test to fail +// Will remove the XFAIL once these patches are merged // XFAIL: * 
#include From 6f812d17b01f3337be68635e71045c43388ba832 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Thu, 6 Jan 2022 22:36:25 -0500 Subject: [PATCH 7/7] [SYCL][Matrix] Add XFAIL because of a known bug --- SYCL/Matrix/element_wise_all_ops_half.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/SYCL/Matrix/element_wise_all_ops_half.cpp b/SYCL/Matrix/element_wise_all_ops_half.cpp index ab4a6d932e..7a7f77f259 100644 --- a/SYCL/Matrix/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/element_wise_all_ops_half.cpp @@ -11,6 +11,11 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// There is a known bug in joint_matrix_fill when type is half +// A PR is being developed to fix the bug +// Will remove the XFAIL once this is fixed +// XFAIL: * + #include #include #include