From b6c86ebb330c5c1788b9a1bcdaa63a0c05ff302c Mon Sep 17 00:00:00 2001
From: pwhMass
Date: Thu, 20 Feb 2025 16:20:22 +0000
Subject: [PATCH] enhancement(cuda): optimize the rearrange operator and
 support multi-dimensional tensors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Note that ARRAY_SIZE is currently 5. This constant bounds the tensor rank the
operator can accept; making it larger also increases the amount of work each
kernel launch has to do.
The Operator is meant to use max_warps_block and warp_size to help size the
launch; they are currently unused.
block_size is currently fixed at 256 and can be tuned further.
---
 operators/src/rearrange/cuda/mod.rs        | 467 ++++++++++++++++-----
 operators/src/rearrange/cuda/rearrange.cuh | 131 +++++-
 2 files changed, 468 insertions(+), 130 deletions(-)

diff --git a/operators/src/rearrange/cuda/mod.rs b/operators/src/rearrange/cuda/mod.rs
index fa8ec5c..ea45c58 100644
--- a/operators/src/rearrange/cuda/mod.rs
+++ b/operators/src/rearrange/cuda/mod.rs
@@ -4,17 +4,45 @@ use super::{Args, Rearrange, args::Scheme};
 use crate::{
     ByteOf, LaunchError, QueueAlloc,
     cuda::{Gpu, Handle, ModuleBox},
-    rank_not_support, shape_not_support,
-};
-use std::{
-    ffi::CString,
-    slice::{from_raw_parts, from_raw_parts_mut},
-    sync::Arc,
 };
+use itertools::Itertools;
+use std::iter::repeat;
+use std::{ffi::CString, sync::Arc};
+
+#[derive(Debug)]
+struct SplitDim {
+    choose_idx: usize,
+    num_per_block: usize,
+    num_per_grid: usize,
+    array_struct_idx_block: ArrayType,
+    array_struct_idx_grid: ArrayType,
+}
+
+const ARRAY_SIZE: usize = 5;
+type ArrayType = i32;
+
+#[derive(Debug, Clone, Copy)]
+struct ArrayStruct<const N: usize>([ArrayType; N]);
+
+impl<const N: usize> ArrayStruct<N> {
+    fn new(element: impl Iterator<Item = ArrayType>, default: ArrayType) -> Option<Self> {
+        let mut array = [default; N];
+        for (i, v) in element.into_iter().enumerate() {
+            if i >= N {
+                return None;
+            }
+            array[i] = v;
+        }
+        Some(Self(array))
+    }
+}
+
+// TODO: the launch configuration should eventually take max_warps_block and warp_size into account.
 pub struct Operator {
     _handle: Arc<Handle>,
+    #[allow(unused)]
     max_warps_block: usize,
+    #[allow(unused)]
     warp_size: usize,
     module: Arc<ModuleBox>,
 }
@@ -55,105 +83,250 @@ impl crate::Operator for Operator {
         QA: QueueAlloc<Hardware = Self::Hardware>,
     {
         let scheme = Scheme::new(args)?;
-        if scheme.ndim() == 0 {
-            let unit = scheme.unit();
-            let dst = unsafe { from_raw_parts_mut(args.dst_base, unit) };
-            let src = unsafe { from_raw_parts(args.src_base, unit) };
-            queue_alloc.queue().memcpy_d2d(dst, src);
-            return Ok(());
+
+        // Find the largest amount of data a single thread can handle.
+        let scheme_update = scheme.distribute_unit((0..=5).rev().map(|n| (1 << n)));
+
+        let src_strides = scheme_update.src_strides();
+        let dst_strides = scheme_update.dst_strides();
+        let shape = scheme_update.shape().collect::<Vec<_>>();
+        let unit = scheme_update.unit();
+        let ndim = scheme_update.ndim();
+
+        // Dimension indices sorted by src stride, in descending order.
+        let src_strides_desc_idx = (0..scheme_update.ndim())
+            .zip(src_strides)
+            .sorted_by(|a, b| b.1.cmp(&a.1))
+            .map(|(i, _)| i)
+            .collect::<Vec<_>>();
+
+        // Partition the dimensions into those handled by the grid and those handled
+        // by the block, following the dimension order of dst.
+        let mut block_dim_choose = repeat(false).take(ndim).collect::<Vec<_>>();
+        let mut src_choose_idx = ndim;
+        let mut dst_choose_idx = ndim;
+
+        let mut block_elements = 1;
+        let mut block_src_elements = 1;
+        let mut block_dst_elements = 1;
+
+        // TODO: needs tuning.
+        let block_size = 256;
+        let mut split_dims = Vec::new(); // holds at most 2 entries
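+
+        // Illustrative note (values invented for exposition): for a 2-D case with
+        // shape = [32, 64], a contiguous dst (element strides [64, 1]) and a
+        // transposed src (element strides [1, 32]), the loop below starts from the
+        // innermost dst dimension (index 1) and the smallest-stride src dimension
+        // (index 0). They differ and both are longer than sqrt(block_size) = 16,
+        // so both get split: each block covers a 16x16 tile (block_len = [16, 16],
+        // exactly 256 threads) and split_dims records num_per_block = 16 with
+        // num_per_grid = 2 and 4 respectively.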
+        while src_choose_idx > 0 && dst_choose_idx > 0 {
+            let src_idx = src_strides_desc_idx[src_choose_idx - 1];
+            let dst_idx = dst_choose_idx - 1;
+
+            if src_idx == dst_idx {
+                let idx = src_idx;
+                let len = shape[idx];
+                if block_elements * shape[src_idx] <= block_size {
+                    // Choose this dimension for the block.
+                    block_dim_choose[idx] = true;
+                    block_elements *= len;
+                    block_src_elements *= len;
+                    block_dst_elements *= len;
+                    src_choose_idx -= 1;
+                    dst_choose_idx -= 1;
+                } else {
+                    // Split this dimension, then stop.
+                    let num_per_block = block_size.div_euclid(block_elements);
+                    assert!(num_per_block > 0);
+                    assert!(len >= num_per_block);
+                    if num_per_block > 1 {
+                        split_dims.push(SplitDim {
+                            choose_idx: idx,
+                            num_per_block,
+                            num_per_grid: len.div_ceil(num_per_block),
+                            array_struct_idx_block: 0,
+                            array_struct_idx_grid: 0,
+                        });
+                    }
+                    break;
+                }
+            } else {
+                let src_div_dst = block_src_elements as f64 / block_dst_elements as f64;
+                let src_num_per_block =
+                    (block_size as f64 / block_elements as f64 / src_div_dst).sqrt();
+                let dst_num_per_block = src_num_per_block * src_div_dst;
+
+                let src_current_dim_len = shape[src_idx];
+                let dst_current_dim_len = shape[dst_idx];
+
+                if (src_current_dim_len as f64) < src_num_per_block {
+                    // Choose this src dimension for the block.
+                    block_dim_choose[src_idx] = true;
+                    block_elements *= src_current_dim_len;
+                    block_src_elements *= src_current_dim_len;
+                    src_choose_idx -= 1;
+                } else if (dst_current_dim_len as f64) < dst_num_per_block {
+                    // Choose this dst dimension for the block.
+                    block_dim_choose[dst_idx] = true;
+                    block_elements *= dst_current_dim_len;
+                    block_dst_elements *= dst_current_dim_len;
+                    dst_choose_idx -= 1;
+                } else {
+                    // Split both dimensions, then stop.
+                    let src_num_per_block = src_num_per_block.floor() as usize;
+                    let dst_num_per_block = dst_num_per_block.floor() as usize;
+                    let src_num_per_grid = src_current_dim_len.div_ceil(src_num_per_block);
+                    let dst_num_per_grid = dst_current_dim_len.div_ceil(dst_num_per_block);
+
+                    if src_num_per_block > 1 {
+                        split_dims.push(SplitDim {
+                            choose_idx: src_idx,
+                            num_per_block: src_num_per_block,
+                            num_per_grid: src_num_per_grid,
+                            array_struct_idx_block: 0,
+                            array_struct_idx_grid: 0,
+                        });
+                    }
+                    if dst_num_per_block > 1 {
+                        split_dims.push(SplitDim {
+                            choose_idx: dst_idx,
+                            num_per_block: dst_num_per_block,
+                            num_per_grid: dst_num_per_grid,
+                            array_struct_idx_block: 0,
+                            array_struct_idx_grid: 0,
+                        });
+                    }
+                    break;
+                }
+            }
         }
-        let scheme = scheme.distribute_unit((0..=5).rev().map(|n| 32 * (1 << n)));
-        let unit = scheme.unit();
+        let mut block_dim: ArrayType = 0;
+
+        let mut block_len = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
+        let mut src_block_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
+        let mut dst_block_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
+
+        let mut grid_len = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
+        let mut src_grid_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
+        let mut dst_grid_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
 
-        struct Layout {
-            r: u32,
-            c: u32,
-            dst_rs: i32,
-            dst_cs: i32,
-            src_rs: i32,
-            src_cs: i32,
+        // Fill the block side: block_len and the block strides.
+        for i in 0..ndim {
+            if block_dim_choose[i] {
+                block_len.push(shape[i] as ArrayType);
+                src_block_stride.push(src_strides[i] as ArrayType);
+                dst_block_stride.push(dst_strides[i] as ArrayType);
+                block_dim += 1;
+            }
+
+            for split_dim in split_dims.iter_mut() {
+                if i == split_dim.choose_idx {
+                    block_len.push(split_dim.num_per_block as ArrayType);
+                    src_block_stride.push(src_strides[i] as ArrayType);
+                    dst_block_stride.push(dst_strides[i] as ArrayType);
+                    split_dim.array_struct_idx_block = block_dim;
+                    block_dim += 1;
+                }
+            }
         }
-        let Layout {
-            r,
-            c,
-            dst_rs,
-            dst_cs,
-            src_rs,
-            src_cs,
-        } = match scheme.ndim() {
-            0 => unreachable!(),
-            1 => {
-                let &[dst_cs] = scheme.dst_strides() else {
-                    unreachable!()
-                };
-                let &[src_cs] = scheme.src_strides() else {
-                    unreachable!()
-                };
-                Layout {
-                    r: 1,
-                    c: scheme.shape().next().unwrap() as _,
-                    dst_rs: 0,
-                    dst_cs: dst_cs as _,
-                    src_rs: 0,
-                    src_cs: src_cs as _,
-                }
+        // Fill the grid side: grid_len and the grid strides.
+        let mut grid_dim = 0;
+        for i in 0..ndim {
+            let mut is_split = false;
+            if !block_dim_choose[i] {
+                for split_dim in split_dims.iter_mut() {
+                    if i == split_dim.choose_idx {
+                        is_split = true;
+                        grid_len.push(split_dim.num_per_grid as ArrayType);
+                        src_grid_stride
+                            .push((src_strides[i] * split_dim.num_per_block as isize) as ArrayType);
+                        dst_grid_stride
+                            .push((dst_strides[i] * split_dim.num_per_block as isize) as ArrayType);
+                        split_dim.array_struct_idx_grid = grid_dim;
+                    }
+                }
+                if !is_split {
+                    grid_len.push(shape[i] as ArrayType);
+                    src_grid_stride.push(src_strides[i] as ArrayType);
+                    dst_grid_stride.push(dst_strides[i] as ArrayType);
                 }
+                grid_dim += 1;
+            }
+        }
+
+        // Prepare the CUDA kernel parameters.
+        let block_len_total = block_len.iter().product::<ArrayType>();
+        let src_block_stride =
+            ArrayStruct::<ARRAY_SIZE>::new(src_block_stride.into_iter(), 0).unwrap();
+        let dst_block_stride =
+            ArrayStruct::<ARRAY_SIZE>::new(dst_block_stride.into_iter(), 0).unwrap();
+        let src_grid_stride =
+            ArrayStruct::<ARRAY_SIZE>::new(src_grid_stride.into_iter(), 0).unwrap();
+        let dst_grid_stride =
+            ArrayStruct::<ARRAY_SIZE>::new(dst_grid_stride.into_iter(), 0).unwrap();
+        let block_len = ArrayStruct::<ARRAY_SIZE>::new(block_len.into_iter(), 1).unwrap();
+        let grid_len = ArrayStruct::<ARRAY_SIZE>::new(grid_len.into_iter(), 1).unwrap();
+
+        let (constrain1, constrain2) = match split_dims.len() {
+            0 => (ArrayStruct([0; 4]), ArrayStruct([0; 4])),
+            1 => {
+                let constrains1 = ArrayStruct([
+                    split_dims[0].array_struct_idx_grid,
+                    split_dims[0].array_struct_idx_block,
+                    split_dims[0].num_per_block as ArrayType,
+                    shape[split_dims[0].choose_idx] as ArrayType,
+                ]);
+                let constrains2 = ArrayStruct([0; 4]);
+                (constrains1, constrains2)
             }
             2 => {
-                let mut shape = scheme.shape();
-                let r = shape.next().unwrap();
-                let c = shape.next().unwrap();
-                let &[dst_rs, dst_cs] = scheme.dst_strides() else {
-                    unreachable!()
-                };
-                let &[src_rs, src_cs] = scheme.src_strides() else {
-                    unreachable!()
-                };
-                Layout {
-                    r: r as _,
-                    c: c as _,
-                    dst_rs: dst_rs as _,
-                    dst_cs: dst_cs as _,
-                    src_rs: src_rs as _,
-                    src_cs: src_cs as _,
-                }
+                let constrains1 = ArrayStruct([
+                    split_dims[0].array_struct_idx_grid,
+                    split_dims[0].array_struct_idx_block,
+                    split_dims[0].num_per_block as ArrayType,
+                    shape[split_dims[0].choose_idx] as ArrayType,
+                ]);
+                let constrains2 = ArrayStruct([
+                    split_dims[1].array_struct_idx_grid,
+                    split_dims[1].array_struct_idx_block,
+                    split_dims[1].num_per_block as ArrayType,
+                    shape[split_dims[1].choose_idx] as ArrayType,
+                ]);
+                (constrains1, constrains2)
+            }
+            _ => {
+                unreachable!()
             }
-            _ => Err(rank_not_support("rearrange not support ndim > 2 on NV GPU"))?,
         };
+        //----------------------------------------------------------------------
         let name = CString::new(NAME).unwrap();
-        if unit % self.warp_size != 0 {
-            Err(shape_not_support(format!(
-                "memory region {unit} is not align to warp size, which is not supported yet on NV GPU",
-            )))?;
-        }
-        let bytes_thread = (unit / self.warp_size) as u32;
-        if bytes_thread > 32 || !bytes_thread.is_power_of_two() {
-            Err(shape_not_support(format!(
-                "bytes per thread {bytes_thread} is not supported yet on NV GPU"
-            )))?;
-        }
-        let warps = self.max_warps_block as u32;
-        let grid = (r, c.div_ceil(warps));
-        let block = (c.div_ceil(grid.1), self.warp_size as u32);
+        let grid = shape
+            .iter()
+            .zip(block_dim_choose.iter())
+            .filter_map(|(len, is_choose)| {
+                if !*is_choose {
+                    Some(*len as ArrayType)
+                } else {
+                    None
+                }
+            })
+            .product::<ArrayType>() as u32;
+        let block = block_size as u32;
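+
+        // Note on the constraints (continuing the illustrative 32x64 example near
+        // the top of this function): each `constrain` quadruple is
+        // [grid-index slot, block-index slot, num_per_block, total length]. There it
+        // would be [0, 0, 16, 32] for the split src dimension and [1, 1, 16, 64]
+        // for the split dst dimension; the kernel drops a thread once
+        // grid_idx * num_per_block + block_idx reaches the total length, so a split
+        // dimension whose length is not a multiple of num_per_block never runs past
+        // its end. Also note that `grid` above multiplies the full length of every
+        // dimension not chosen for the block, which for split dimensions launches
+        // more blocks than `grid_len` strictly needs; the surplus blocks appear to
+        // decode to already-covered indices and merely repeat work.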
 
-        let unit = unit as i32;
-        let dst_rs = dst_rs / unit;
-        let dst_cs = dst_cs / unit;
-        let src_rs = src_rs / unit;
-        let src_cs = src_cs / unit;
+        let unit = unit as usize;
 
         let params = params![
             args.dst_base,
-            dst_rs,
-            dst_cs,
             args.src_base,
-            src_rs,
-            src_cs,
-            c,
-            bytes_thread
+            block_dim,         // number of block dimensions
+            block_len_total,   // product of the entries of block_len
+            constrain1,
+            constrain2,
+            block_len,         // lengths of the block dimensions
+            src_block_stride,  // source strides per block dimension (bytes)
+            dst_block_stride,  // destination strides per block dimension (bytes)
+            grid_len,          // lengths of the grid dimensions
+            src_grid_stride,   // source strides per grid dimension (bytes)
+            dst_grid_stride,   // destination strides per grid dimension (bytes)
+            unit               // bytes per thread
         ];
         self.module.launch(
             &name,
@@ -167,25 +340,50 @@
 fn format_code() -> String {
     format!(
-        r#"{CODE}
+        r#"#define ARRAY_SIZE {ARRAY_SIZE}
+#define ARRAY_TYPE int
+{CODE}
 
 extern "C" __global__ void {NAME}(
     void *__restrict__ dst,
-    int const rsa,
-    int const csa,
     void const *__restrict__ src,
-    int const rsb,
-    int const csb,
-    unsigned int const ncols,
-    unsigned int const bytes_per_thread
+    const int block_dim,                                         // number of block dimensions
+    const int block_len_total,                                   // product of the entries of block_len
+    const ArrayStruct<4, ARRAY_TYPE> constrains1,                // constraint for the first split dimension
+    const ArrayStruct<4, ARRAY_TYPE> constrains2,                // constraint for the second split dimension
+    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> block_len,         // lengths of the block dimensions
+    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> src_block_stride,  // source strides per block dimension (bytes)
+    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> dst_block_stride,  // destination strides per block dimension (bytes)
+    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> grid_len,          // lengths of the grid dimensions
+    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> src_grid_stride,   // source strides per grid dimension (bytes)
+    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> dst_grid_stride,   // destination strides per grid dimension (bytes)
+    unsigned int const unit_size                                 // bytes handled per thread
 ){{
-    switch (bytes_per_thread) {{
-    case 1: rearrange<uchar1>(dst, rsa, csa, src, rsb, csb, ncols); break;
-    case 2: rearrange<uchar2>(dst, rsa, csa, src, rsb, csb, ncols); break;
-    case 4: rearrange<float1>(dst, rsa, csa, src, rsb, csb, ncols); break;
-    case 8: rearrange<float2>(dst, rsa, csa, src, rsb, csb, ncols); break;
-    case 16: rearrange<float4>(dst, rsa, csa, src, rsb, csb, ncols); break;
-    case 32: rearrange<double4>(dst, rsa, csa, src, rsb, csb, ncols); break;
+    switch (unit_size) {{
+    case 1:
+        rearrange_1<uchar1>(dst, src, block_dim, block_len_total, constrains1, constrains2,
+            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
+        break;
+    case 2:
+        rearrange_1<uchar2>(dst, src, block_dim, block_len_total, constrains1, constrains2,
+            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
+        break;
+    case 4:
+        rearrange_1<float1>(dst, src, block_dim, block_len_total, constrains1, constrains2,
+            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
+        break;
+    case 8:
+        rearrange_1<float2>(dst, src, block_dim, block_len_total, constrains1, constrains2,
+            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
+        break;
+    case 16:
+        rearrange_1<float4>(dst, src, block_dim, block_len_total, constrains1, constrains2,
+            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
+        break;
+    case 32:
+        rearrange_1<double4>(dst, src, block_dim, block_len_total, constrains1, constrains2,
+            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
+        break;
     }}
 }}
 "#
@@ -218,29 +416,43 @@ mod test {
     fn test_compute() {
         use super::super::common_cpu::Operator as RefOp;
         use crate::common_cpu::{Cpu, ThisThread};
+        use cuda::memcpy_d2h;
         use ndarray_layout::{ArrayLayout, Endian::BigEndian};
         use rand::Rng;
-
+        // use crate::rearrange::cuda::format_code;
+        // let code = format_code();
+        // std::fs::write("rearrange.cu", code).unwrap();
         let Some(gpu) = Gpu::init() else {
             return;
         };
 
-        let dt = ty::U32;
+        let dt = ty::U64;
 
         let cpu_op = RefOp::new(&Cpu);
         let gpu_op = Operator::new(&gpu);
 
-        let nh = 32;
-        let seq = 7;
-        let dh = 128;
-        let mut src = vec![0u32; nh * seq * dh];
+        const N: usize = 5;
+        const TRANS_N: usize = 3;
+        let shape: [usize; N] = [2232, 3, 7, 9, 4];
+        let mut r_shape: [usize; N] = shape.clone();
+        r_shape[0..TRANS_N].reverse();
+
+        let trans_param: [usize; TRANS_N] =
+            (0..TRANS_N).rev().collect::<Vec<_>>().try_into().unwrap();
+
+        let mut src = vec![0u64; shape.iter().product::<usize>()];
         rand::rng().fill(&mut src[..]);
 
         let ele = dt.nbytes();
-        let s_src = ArrayLayout::<3>::new_contiguous(&[nh, seq, dh], BigEndian, ele);
-        let s_dst =
-            ArrayLayout::<3>::new_contiguous(&[seq, nh, dh], BigEndian, ele).transpose(&[1, 0]);
+        let s_src = ArrayLayout::<3>::new_contiguous(&shape, BigEndian, ele);
+        let s_dst =
+            ArrayLayout::<3>::new_contiguous(&r_shape, BigEndian, ele).transpose(&trans_param);
+
+        println!("s_src shape: {:?}", s_src.shape());
+        println!("s_dst shape: {:?}", s_dst.shape());
+        println!("s_src strides: {:?}", s_src.strides());
+        println!("s_dst strides: {:?}", s_dst.strides());
 
         let dst_ans = gpu.apply(|ctx| {
             let stream = ctx.stream();
@@ -248,13 +460,37 @@
             let rt = &stream;
             #[cfg(use_iluvatar)]
             let rt = ctx;
+
             let src = rt.from_host(&src);
             let mut dst = rt.malloc::<u8>(src.len());
+
+            let start_event = stream.record();
+
+            stream.bench(
+                |_, stream| {
+                    gpu_op
+                        .launch(
+                            &args(
+                                dt,
+                                &shape,
+                                s_src.strides(),
+                                s_dst.strides(),
+                                src.as_ptr().cast(),
+                                dst.as_mut_ptr().cast(),
+                            ),
+                            &mut [],
+                            stream,
+                        )
+                        .unwrap();
+                },
+                5,
+                1,
+            );
             gpu_op
                 .launch(
                     &args(
                         dt,
-                        &[nh, seq, dh],
+                        &shape,
                         s_src.strides(),
                         s_dst.strides(),
                         src.as_ptr().cast(),
@@ -264,17 +500,22 @@
                     &stream,
                 )
                 .unwrap();
-            let mut host = vec![0u32; nh * seq * dh];
+            let end_event = stream.record();
+            end_event.synchronize();
+            let time = end_event.elapse_from(&start_event);
+            println!("time: {time:?}");
+
+            let mut host = vec![0u64; shape.iter().product::<usize>()];
             memcpy_d2h(&mut host, &dst);
             host
         });
 
-        let mut dst_ref = vec![0u32; seq * nh * dh];
+        let mut dst_ref = vec![0u64; shape.iter().product::<usize>()];
         cpu_op
             .launch(
                 &args(
                     dt,
-                    &[nh, seq, dh],
+                    &shape,
                     s_src.strides(),
                     s_dst.strides(),
                     src.as_ptr().cast(),
diff --git a/operators/src/rearrange/cuda/rearrange.cuh b/operators/src/rearrange/cuda/rearrange.cuh
index f7b6561..e4e7583 100644
--- a/operators/src/rearrange/cuda/rearrange.cuh
+++ b/operators/src/rearrange/cuda/rearrange.cuh
@@ -1,25 +1,122 @@
-template<class Tmem>
-static __device__ void rearrange(
+template <int ArrSize, typename ArrayType>
+struct ArrayStruct {
+    ArrayType a[ArrSize];
+};
+
+template <typename Tmem, int ArrSize, typename ArrayType>
+static __device__ void rearrange_1(
     void *__restrict__ dst,
-    int const rsa,
-    int const csa,
     void const *__restrict__ src,
-    int const rsb,
-    int const csb,
-    unsigned int const ncols) {
+    unsigned int const block_dim,                            // number of block dimensions
+    unsigned int const block_len_total,                      // product of the entries of block_len
+    const ArrayStruct<4, ArrayType> constrains1,             // constraint for the first split dimension: [grid-index slot, block-index slot, how many block steps one grid step covers, total length limit]
+    const ArrayStruct<4, ArrayType> constrains2,             // constraint for the second split dimension
+    const ArrayStruct<ArrSize, ArrayType> block_len,         // lengths of the block dimensions
+    const ArrayStruct<ArrSize, ArrayType> src_block_stride,  // source strides per block dimension (bytes)
+    const ArrayStruct<ArrSize, ArrayType> dst_block_stride,  // destination strides per block dimension (bytes)
+    const ArrayStruct<ArrSize, ArrayType> grid_len,          // lengths of the grid dimensions
+    const ArrayStruct<ArrSize, ArrayType> src_grid_stride,   // source strides per grid dimension (bytes)
+    const ArrayStruct<ArrSize, ArrayType> dst_grid_stride,  // destination strides per grid dimension (bytes)
+    unsigned int const unit_size                             // bytes handled per thread
+) {
 
-    auto row = blockIdx.y,
-         col = blockIdx.x * blockDim.y + threadIdx.y;
-    if (col >= ncols) {
+    int remaining = threadIdx.x;
+    if (remaining >= block_len_total) {
         return;
     }
 
-    auto thread = threadIdx.x,
-         warp_size = blockDim.x;
-    auto i = (row * rsa + col * csa) * warp_size + thread;
-    auto j = (row * rsb + col * csb) * warp_size + thread;
-    // printf("%d %d %d %d: row = %d, col = %d, nrows = %d, ncols = %d, rsa = %d, rsb = %d, csa = %d, csb = %d, warp_size = %d, thread = %d, i = %d, j = %d\n",
-    //        blockIdx.y, blockIdx.x, threadIdx.y, threadIdx.x, row, col, gridDim.y, ncols, rsa, rsb, csa, csb, warp_size, thread, i, j);
+    // Shared memory for the per-block values computed by thread 0.
+    __shared__ int shared_src_offset;
+    __shared__ int shared_dst_offset;
+
+    __shared__ int shared_constrains1_grid_idx_multiple;
+    __shared__ int shared_constrains2_grid_idx_multiple;
+
+    if (threadIdx.x == 0) {  // only thread 0 does this computation
+        // Compute the base byte offsets into src and dst for the data this block handles.
+        int src_offset = 0;
+        int dst_offset = 0;
+        int remaining = blockIdx.x;
+#pragma unroll
+        for (int i = ArrSize - 1; i >= 0; i--) {
+            int idx = remaining % grid_len.a[i];
+            remaining /= grid_len.a[i];
+            src_offset += idx * src_grid_stride.a[i];
+            dst_offset += idx * dst_grid_stride.a[i];
+
+            if (i == constrains1.a[0]) {
+                shared_constrains1_grid_idx_multiple = idx * constrains1.a[2];
+            }
+            if (i == constrains2.a[0]) {
+                shared_constrains2_grid_idx_multiple = idx * constrains2.a[2];
+            }
+
+            // Publish the results through shared memory.
+            shared_src_offset = src_offset;
+            shared_dst_offset = dst_offset;
+        }
+    }
+
+    // Make sure every thread sees the values written to shared memory.
+    __syncthreads();
+
+    // All threads reuse the precomputed offsets.
+    int src_offset = shared_src_offset;
+    int dst_offset = shared_dst_offset;
+
+    int constrains1_grid_idx_multiple = shared_constrains1_grid_idx_multiple;
+    int constrains2_grid_idx_multiple = shared_constrains2_grid_idx_multiple;
+
+    for (int i = ArrSize - 1; i > 0; i--) {
+        if (block_len.a[i] > 1) {
+            int idx = remaining % block_len.a[i];
+            remaining /= block_len.a[i];
+            // Accumulate this dimension's contribution to the offsets.
+            src_offset += idx * src_block_stride.a[i];
+            dst_offset += idx * dst_block_stride.a[i];
+
+            if (constrains1.a[3] != 0 && i == constrains1.a[1]) {
+                if (constrains1_grid_idx_multiple + idx >= constrains1.a[3]) {
+                    return;
+                }
+            }
 
-    reinterpret_cast<Tmem *>(dst)[i] = reinterpret_cast<Tmem const *>(src)[j];
+            if (constrains2.a[3] != 0 && i == constrains2.a[1]) {
+                if (constrains2_grid_idx_multiple + idx >= constrains2.a[3]) {
+                    return;
+                }
+            }
+        }
+    }
+
+    // Handle dimension 0 separately.
+    if (remaining >= block_len.a[0]) {
+        return;
+    }
+    src_offset += remaining * src_block_stride.a[0];
+    dst_offset += remaining * dst_block_stride.a[0];
+
+    if (constrains1.a[3] != 0 && 0 == constrains1.a[1]) {
+        if (constrains1_grid_idx_multiple + remaining >= constrains1.a[3]) {
+            return;
+        }
+    }
+
+    if (constrains2.a[3] != 0 && 0 == constrains2.a[1]) {
+        if (constrains2_grid_idx_multiple + remaining >= constrains2.a[3]) {
+            return;
+        }
+    }
+
+    // Copy the data; the offsets are already in bytes.
+    const int elements_per_thread = unit_size / sizeof(Tmem);
+    if (elements_per_thread == 1) {
+        *reinterpret_cast<Tmem *>(reinterpret_cast<char *>(dst) + dst_offset) =
+            *reinterpret_cast<Tmem const *>(reinterpret_cast<char const *>(src) + src_offset);
+    } else {
+        for (int i = 0; i < elements_per_thread; i++) {
+            reinterpret_cast<Tmem *>(reinterpret_cast<char *>(dst) + dst_offset)[i] =
+                reinterpret_cast<Tmem const *>(reinterpret_cast<char const *>(src) + src_offset)[i];
+        }
+    }
 }
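
Reviewer note (not part of the patch): the kernel's core trick is to decode one
linear index into per-dimension coordinates by repeated modulo/division over the
dimension lengths, and to turn those coordinates into byte offsets via
per-dimension strides. The standalone sketch below shows just that technique on a
single 2x3x4 permutation, one thread per element, without the block/grid split or
the constraint checks; the file name, rearrange_demo, Dims and the example shape
are invented for illustration.

// rearrange_sketch.cu -- illustrative only; build with `nvcc rearrange_sketch.cu`.
#include <cstdio>
#include <cuda_runtime.h>

#define ARR 3

struct Dims {
    int len[ARR];         // logical lengths (dst order)
    int src_stride[ARR];  // source strides in bytes
    int dst_stride[ARR];  // destination strides in bytes
};

// One thread per element: decode the linear index into per-dimension
// coordinates (mixed radix), then accumulate byte offsets from the strides.
__global__ void rearrange_demo(char *dst, const char *src, Dims d, int total, int unit) {
    int remaining = blockIdx.x * blockDim.x + threadIdx.x;
    if (remaining >= total) {
        return;
    }
    int src_off = 0, dst_off = 0;
    for (int i = ARR - 1; i >= 0; --i) {
        int idx = remaining % d.len[i];
        remaining /= d.len[i];
        src_off += idx * d.src_stride[i];
        dst_off += idx * d.dst_stride[i];
    }
    for (int b = 0; b < unit; ++b) {
        dst[dst_off + b] = src[src_off + b];
    }
}

int main() {
    // dst is a contiguous [2, 3, 4] float tensor; src holds the same elements
    // stored with the last two axes swapped, i.e. as a contiguous [2, 4, 3] block.
    const int unit = sizeof(float);
    const int total = 2 * 3 * 4;
    Dims d = {{2, 3, 4},
              {12 * unit, 1 * unit, 3 * unit},   // src strides (bytes)
              {12 * unit, 4 * unit, 1 * unit}};  // dst strides (bytes)
    float h_src[total], h_dst[total];
    for (int i = 0; i < total; ++i) {
        h_src[i] = float(i);
    }
    char *d_src, *d_dst;
    cudaMalloc((void **)&d_src, sizeof(h_src));
    cudaMalloc((void **)&d_dst, sizeof(h_dst));
    cudaMemcpy(d_src, h_src, sizeof(h_src), cudaMemcpyHostToDevice);
    rearrange_demo<<<1, 256>>>(d_dst, d_src, d, total, unit);
    cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
    // Element (n=1, i=2, j=1): dst index 1*12 + 2*4 + 1 = 21 must equal the value
    // stored in src at 1*12 + 1*3 + 2 = 17.
    printf("dst(1,2,1) = %g, expected %g\n", h_dst[21], h_src[17]);
    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}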