diff --git a/operators/src/rearrange/cuda/mod.rs b/operators/src/rearrange/cuda/mod.rs
index ea45c58..4cd8232 100644
--- a/operators/src/rearrange/cuda/mod.rs
+++ b/operators/src/rearrange/cuda/mod.rs
@@ -1,13 +1,53 @@
-use cuda::params;
-
-use super::{Args, Rearrange, args::Scheme};
+use super::{Args, Rearrange, args::Scheme as ArgsScheme};
+use crate::rank_not_support;
 use crate::{
-    ByteOf, LaunchError, QueueAlloc,
+    ByteOf, LaunchError, QueueAlloc, SchemeDiversity,
     cuda::{Gpu, Handle, ModuleBox},
 };
 use itertools::Itertools;
+use lru::LruCache;
+use std::cmp::max;
 use std::iter::repeat;
-use std::{ffi::CString, sync::Arc};
+use std::slice::{from_raw_parts, from_raw_parts_mut};
+use std::{
+    ffi::CString,
+    sync::{Arc, Mutex},
+};
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+struct SchemeKey {
+    unit_size: usize,
+    block_array_size: usize,
+    grid_array_size: usize,
+    constrain_num: usize,
+}
+
+#[derive(Clone)]
+struct Scheme {
+    module: Arc<ModuleBox>,
+    name: CString,
+}
+
+impl Scheme {
+    pub fn new(key: SchemeKey, handle: &Arc<Handle>) -> Self {
+        let name = kernel_name(key);
+        let cc = handle.device().compute_capability();
+        // for DEBUG
+        // let code = format_code(key);
+        // std::fs::write("rearrange.cu", code).unwrap();
+
+        Self {
+            module: handle.compile_kernel(&name, cc, || format_code(key)),
+            name: CString::new(name).unwrap(),
+        }
+    }
+}
+
+/// Type used for array indices and strides
+type ArrayType = i32;
+
+// Default array size; also the maximum array size. Must not be 0.
+const DEFAULT_ARRAY_SIZE: usize = 5;
+const CONSTRAIN_ARRAY_SIZE: usize = 8;
 
 #[derive(Debug)]
 struct SplitDim {
@@ -16,38 +56,49 @@ struct SplitDim {
     num_per_grid: usize,
     array_struct_idx_block: ArrayType,
     array_struct_idx_grid: ArrayType,
+    dim_len: usize,
 }
 
-const ARRAY_SIZE: usize = 5;
+#[derive(Debug)]
+struct ArrayStruct(Vec<ArrayType>);
 
-type ArrayType = i32;
-#[derive(Debug, Clone, Copy)]
-struct ArrayStruct<const N: usize>([ArrayType; N]);
-
-impl<const N: usize> ArrayStruct<N> {
-    fn new(element: impl Iterator<Item = ArrayType>, default: ArrayType) -> Option<Self> {
-        let mut array = [default; N];
-        for (i, v) in element.into_iter().enumerate() {
-            if i >= N {
-                return None;
+impl ArrayStruct {
+    fn new(mut array: Vec<ArrayType>, default: ArrayType) -> Self {
+        while array.len() < DEFAULT_ARRAY_SIZE {
+            array.push(default);
+        }
+        Self(array)
+    }
+
+    fn try_into_array<const N: usize>(self) -> Result<[ArrayType; N], LaunchError> {
+        if self.0.len() > N {
+            Err(rank_not_support("ArrayStruct::try_into_array"))
+        } else {
+            let ArrayStruct(vec) = self;
+            if vec.len() == N {
+                Ok(vec.try_into().unwrap())
+            } else {
+                let vec_len = vec.len();
+                Ok(vec
+                    .into_iter()
+                    .chain(repeat(0).take(N - vec_len))
+                    .collect::<Vec<ArrayType>>()
+                    .try_into()
+                    .unwrap())
             }
-            array[i] = v;
         }
-        Some(Self(array))
     }
 }
 
-//TODO should be computed from max_warps_block and warp_size
 pub struct Operator {
-    _handle: Arc<Handle>,
+    handle: Arc<Handle>,
     #[allow(unused)]
     max_warps_block: usize,
     #[allow(unused)]
     warp_size: usize,
-    module: Arc<ModuleBox>,
+    schemes: Mutex<LruCache<SchemeKey, Scheme>>,
 }
 
-const NAME: &str = "rearrange";
 const CODE: &str = include_str!("rearrange.cuh");
 
 impl Rearrange for Operator {}
@@ -62,14 +113,13 @@ impl crate::Operator for Operator {
         let device = node.0.device();
         let max_threads_block = device.block_limit().max_threads;
         let warp_size = device.warp_size();
-        let cc = device.compute_capability();
         assert_eq!(max_threads_block % warp_size, 0);
         // Create execution resources
         Self {
-            _handle: node.0.clone(),
+            handle: node.0.clone(),
            max_warps_block: max_threads_block / warp_size,
             warp_size,
-            module: node.0.compile_kernel(NAME, cc, format_code),
+            schemes: node.0.scheme_cache(SchemeDiversity::Low),
         }
    }
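Reviewer note: with the module cache keyed by `SchemeKey`, each (unit size, array sizes, constraint count) combination is compiled at most once and then served from the LRU. A minimal standalone sketch of the same get-or-compile pattern, assuming only the `lru` crate; `String` stands in for the compiled `Scheme`:

```rust
use lru::LruCache;
use std::num::NonZeroUsize;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct SchemeKey {
    unit_size: usize,
    block_array_size: usize,
    grid_array_size: usize,
    constrain_num: usize,
}

fn main() {
    let mut cache: LruCache<SchemeKey, String> =
        LruCache::new(NonZeroUsize::new(16).unwrap());
    let key = SchemeKey { unit_size: 16, block_array_size: 3, grid_array_size: 2, constrain_num: 1 };
    // The first call pays for "compilation"; the second is a cache hit.
    for _ in 0..2 {
        let module = cache.get_or_insert(key, || format!("compiled {key:?}"));
        assert!(module.starts_with("compiled"));
    }
    assert_eq!(cache.len(), 1);
}
```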
@@ -82,10 +132,17 @@ impl crate::Operator for Operator {
     where
         QA: QueueAlloc,
     {
-        let scheme = Scheme::new(args)?;
+        let scheme_update = ArgsScheme::new(args)?;
 
         // Find the largest unit of data a single thread can handle
-        let scheme_update = scheme.distribute_unit((0..=5).rev().map(|n| (1 << n)));
+        let scheme_update = scheme_update.distribute_unit((0..=5).rev().map(|n| (1 << n)));
+        if scheme_update.ndim() == 0 {
+            let unit = scheme_update.unit();
+            let dst = unsafe { from_raw_parts_mut(args.dst_base, unit) };
+            let src = unsafe { from_raw_parts(args.src_base, unit) };
+            queue_alloc.queue().memcpy_d2d(dst, src);
+            return Ok(());
+        }
 
         let src_strides = scheme_update.src_strides();
         let dst_strides = scheme_update.dst_strides();
@@ -96,114 +153,128 @@ impl crate::Operator for Operator {
         // indices of src strides, in descending stride order
         let src_strides_desc_idx = (0..scheme_update.ndim())
             .zip(src_strides)
-            .sorted_by(|a, b| b.1.cmp(&a.1))
+            .sorted_by(|a, b| b.1.cmp(a.1))
             .map(|(i, _)| i)
             .collect::<Vec<_>>();
 
         // Partition the dimensions into grid-handled and block-handled ones,
         // matching the dst dimension order
         let mut block_dim_choose = repeat(false).take(ndim).collect::<Vec<_>>();
-        let mut src_choose_idx = ndim;
-        let mut dst_choose_idx = ndim;
-
-        let mut block_elements = 1;
-        let mut block_src_elements = 1;
-        let mut block_dst_elements = 1;
 
         //TODO needs tuning
-        let block_size = 256;
+        let max_block_size = 256;
 
         let mut split_dims = Vec::new(); // holds at most 2 entries
 
-        while src_choose_idx > 0 && dst_choose_idx > 0 {
-            let src_idx = src_strides_desc_idx[src_choose_idx - 1];
-            let dst_idx = dst_choose_idx - 1;
-
-            if src_idx == dst_idx {
-                let idx = src_idx;
-                let len = shape[idx];
-                if block_elements * shape[src_idx] <= block_size {
-                    // choose this dimension
-                    block_dim_choose[idx] = true;
-                    block_elements *= len;
-                    block_src_elements *= len;
-                    block_dst_elements *= len;
-                    src_choose_idx -= 1;
-                    dst_choose_idx -= 1;
-                } else {
-                    // split this dimension, then stop
-                    let num_per_block = block_size.div_euclid(block_elements);
-                    assert!(num_per_block > 0);
-                    assert!(len >= num_per_block);
-                    if num_per_block > 1 {
-                        split_dims.push(SplitDim {
-                            choose_idx: idx,
-                            num_per_block,
-                            num_per_grid: len.div_ceil(num_per_block),
-                            array_struct_idx_block: 0,
-                            array_struct_idx_grid: 0,
-                        });
+        // Dimension selection
+        {
+            let mut src_choose_idx = ndim;
+            let mut dst_choose_idx = ndim;
+
+            let mut block_elements = 1;
+            let mut block_src_elements = 1;
+            let mut block_dst_elements = 1;
+
+            while src_choose_idx > 0 && dst_choose_idx > 0 {
+                let src_idx = src_strides_desc_idx[src_choose_idx - 1];
+                let dst_idx = dst_choose_idx - 1;
+
+                if src_idx == dst_idx {
+                    let idx = src_idx;
+                    let len = shape[idx];
+                    if block_elements * shape[src_idx] <= max_block_size {
+                        // choose this dimension
+                        block_dim_choose[idx] = true;
+                        block_elements *= len;
+                        block_src_elements *= len;
+                        block_dst_elements *= len;
+                        src_choose_idx -= 1;
+                        dst_choose_idx -= 1;
+                    } else {
+                        // split this dimension, then stop
+                        let num_per_block = max_block_size.div_euclid(block_elements);
+                        assert!(num_per_block > 0);
+                        assert!(len >= num_per_block);
+                        if num_per_block > 1 {
+                            split_dims.push(SplitDim {
+                                choose_idx: idx,
+                                num_per_block,
+                                num_per_grid: len.div_ceil(num_per_block),
+                                array_struct_idx_block: 0,
+                                array_struct_idx_grid: 0,
+                                dim_len: len,
+                            });
+                        }
+                        break;
                     }
-                    break;
-                }
-            } else {
-                let src_div_dst = block_src_elements as f64 / block_dst_elements as f64;
-                let src_num_per_block =
-                    (block_size as f64 / block_elements as f64 / src_div_dst).sqrt();
-                let dst_num_per_block = src_num_per_block * src_div_dst;
-
-                let src_current_dim_len = shape[src_idx];
-                let dst_current_dim_len = shape[dst_idx];
-
-                if (src_current_dim_len as f64) < src_num_per_block {
-                    // choose this dimension
-                    block_dim_choose[src_idx] = true;
-                    block_elements *= src_current_dim_len;
-                    block_src_elements *= src_current_dim_len;
-                    src_choose_idx -= 1;
-                } else if (dst_current_dim_len as f64) < dst_num_per_block {
-                    // choose this dimension
-                    block_dim_choose[dst_idx] = true;
-                    block_elements *= dst_current_dim_len;
-                    block_dst_elements *= dst_current_dim_len;
-                    dst_choose_idx -= 1;
-                } else {
-                    // split the dimensions, then stop
-                    let src_num_per_block = src_num_per_block.floor() as usize;
-                    let dst_num_per_block = dst_num_per_block.floor() as usize;
-                    let src_num_per_grid = src_current_dim_len.div_ceil(src_num_per_block);
-                    let dst_num_per_grid = dst_current_dim_len.div_ceil(dst_num_per_block);
-
-                    if src_num_per_block > 1 {
-                        split_dims.push(SplitDim {
-                            choose_idx: src_idx,
-                            num_per_block: src_num_per_block,
-                            num_per_grid: src_num_per_grid,
-                            array_struct_idx_block: 0,
-                            array_struct_idx_grid: 0,
-                        });
-                    }
-                    if dst_num_per_block > 1 {
-                        split_dims.push(SplitDim {
-                            choose_idx: dst_idx,
-                            num_per_block: dst_num_per_block,
-                            num_per_grid: dst_num_per_grid,
-                            array_struct_idx_block: 0,
-                            array_struct_idx_grid: 0,
-                        });
+                } else {
+                    let src_div_dst = block_src_elements as f64 / block_dst_elements as f64;
+                    let src_num_per_block =
+                        (max_block_size as f64 / block_elements as f64 / src_div_dst).sqrt();
+                    let dst_num_per_block = src_num_per_block * src_div_dst;
+
+                    let src_current_dim_len = shape[src_idx];
+                    let dst_current_dim_len = shape[dst_idx];
+
+                    if (src_current_dim_len as f64) < src_num_per_block {
+                        // choose this dimension
+                        block_dim_choose[src_idx] = true;
+                        block_elements *= src_current_dim_len;
+                        block_src_elements *= src_current_dim_len;
+                        src_choose_idx -= 1;
+                    } else if (dst_current_dim_len as f64) < dst_num_per_block {
+                        // choose this dimension
+                        block_dim_choose[dst_idx] = true;
+                        block_elements *= dst_current_dim_len;
+                        block_dst_elements *= dst_current_dim_len;
+                        dst_choose_idx -= 1;
+                    } else {
+                        // split the dimensions, then stop
+                        let src_num_per_block = src_num_per_block.floor() as usize;
+                        let dst_num_per_block = dst_num_per_block.floor() as usize;
+                        let src_num_per_grid = src_current_dim_len.div_ceil(src_num_per_block);
+                        let dst_num_per_grid = dst_current_dim_len.div_ceil(dst_num_per_block);
+
+                        if src_num_per_block == 1 {
+                        } else if src_num_per_grid == 1 {
+                            block_dim_choose[src_idx] = true;
+                        } else {
+                            split_dims.push(SplitDim {
+                                choose_idx: src_idx,
+                                num_per_block: src_num_per_block,
+                                num_per_grid: src_num_per_grid,
+                                array_struct_idx_block: 0,
+                                array_struct_idx_grid: 0,
+                                dim_len: src_current_dim_len,
+                            });
+                        }
+
+                        if dst_num_per_block == 1 {
+                        } else if dst_num_per_grid == 1 {
+                            block_dim_choose[dst_idx] = true;
+                        } else {
+                            split_dims.push(SplitDim {
+                                choose_idx: dst_idx,
+                                num_per_block: dst_num_per_block,
+                                num_per_grid: dst_num_per_grid,
+                                array_struct_idx_block: 0,
+                                array_struct_idx_grid: 0,
+                                dim_len: dst_current_dim_len,
+                            });
+                        }
+                        break;
                     }
-                    break;
                 }
             }
         }
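Reviewer note on the `else` branch above: when the innermost src and dst dimensions differ, the remaining block budget `max_block_size / block_elements` is split between the two dimensions so that the two tiles multiply to the budget and their ratio offsets the elements already chosen on each side. A standalone recomputation with illustrative numbers (not taken from the PR):

```rust
fn main() {
    let max_block_size = 256.0_f64;
    // elements already chosen for the block: 4 total, 2 from src's side, 2 from dst's
    let (block_elements, block_src, block_dst) = (4.0_f64, 2.0_f64, 2.0_f64);
    let src_div_dst = block_src / block_dst; // 1.0
    let src_num_per_block = (max_block_size / block_elements / src_div_dst).sqrt(); // 8.0
    let dst_num_per_block = src_num_per_block * src_div_dst; // 8.0
    // the two tiles always multiply to the remaining budget
    assert_eq!(src_num_per_block * dst_num_per_block, max_block_size / block_elements);
}
```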
 
         let mut block_dim: ArrayType = 0;
-        let mut block_len = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
-        let mut src_block_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
-        let mut dst_block_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
+        let mut block_len = Vec::<ArrayType>::with_capacity(DEFAULT_ARRAY_SIZE);
+        let mut src_block_stride = Vec::<ArrayType>::with_capacity(DEFAULT_ARRAY_SIZE);
+        let mut dst_block_stride = Vec::<ArrayType>::with_capacity(DEFAULT_ARRAY_SIZE);
 
-        let mut grid_len = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
-        let mut src_grid_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
-        let mut dst_grid_stride = Vec::<ArrayType>::with_capacity(ARRAY_SIZE);
+        let mut grid_len = Vec::<ArrayType>::with_capacity(DEFAULT_ARRAY_SIZE);
+        let mut src_grid_stride = Vec::<ArrayType>::with_capacity(DEFAULT_ARRAY_SIZE);
+        let mut dst_grid_stride = Vec::<ArrayType>::with_capacity(DEFAULT_ARRAY_SIZE);
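Reviewer note: in the fill loops below, a split dimension contributes one entry to the block arrays and one to the grid arrays, and the grid entry's stride is the original stride scaled by `num_per_block`, since each grid step skips a whole block tile. A hedged sketch with made-up numbers:

```rust
fn main() {
    // one dimension: 17 elements with a 4-byte stride, split 8 per block
    let (len, num_per_block): (usize, usize) = (17, 8);
    let stride: isize = 4; // bytes, as in the PR
    let block_entry = (num_per_block, stride); // 8 elements, 4-byte steps
    let grid_entry = (
        len.div_ceil(num_per_block),     // 3 grid steps
        stride * num_per_block as isize, // each step jumps 32 bytes
    );
    assert_eq!(block_entry, (8, 4));
    assert_eq!(grid_entry, (3, 32));
}
```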
 
         // Handle block dims: fill block_len and the block strides
         for i in 0..ndim {
@@ -226,7 +297,7 @@
         }
 
         // Handle grid dims: fill grid_len and the grid strides
-        let mut grid_dim = 0;
+        let mut grid_dim = 0_u32;
         for i in 0..ndim {
             let mut is_split = false;
             if !block_dim_choose[i] {
@@ -238,7 +309,7 @@
                         .push((src_strides[i] * split_dim.num_per_block as isize) as ArrayType);
                     dst_grid_stride
                         .push((dst_strides[i] * split_dim.num_per_block as isize) as ArrayType);
-                    split_dim.array_struct_idx_grid = grid_dim;
+                    split_dim.array_struct_idx_grid = grid_dim as ArrayType;
                 }
             }
             if !is_split {
@@ -250,86 +321,79 @@
             }
         }
 
-        // Prepare CUDA kernel parameters
-        let block_len_total = block_len.iter().product::<ArrayType>();
-        let src_block_stride =
-            ArrayStruct::<ARRAY_SIZE>::new(src_block_stride.into_iter(), 0).unwrap();
-        let dst_block_stride =
-            ArrayStruct::<ARRAY_SIZE>::new(dst_block_stride.into_iter(), 0).unwrap();
-        let src_grid_stride =
-            ArrayStruct::<ARRAY_SIZE>::new(src_grid_stride.into_iter(), 0).unwrap();
-        let dst_grid_stride =
-            ArrayStruct::<ARRAY_SIZE>::new(dst_grid_stride.into_iter(), 0).unwrap();
-        let block_len = ArrayStruct::<ARRAY_SIZE>::new(block_len.into_iter(), 1).unwrap();
-        let grid_len = ArrayStruct::<ARRAY_SIZE>::new(grid_len.into_iter(), 1).unwrap();
-
-        let (constrain1, constrain2) = match split_dims.len() {
-            0 => (ArrayStruct([0; 4]), ArrayStruct([0; 4])),
-            1 => {
-                let constrains1 = ArrayStruct([
-                    split_dims[0].array_struct_idx_grid,
-                    split_dims[0].array_struct_idx_block,
-                    split_dims[0].num_per_block as ArrayType,
-                    shape[split_dims[0].choose_idx] as ArrayType,
-                ]);
-                let constrains2 = ArrayStruct([0; 4]);
-                (constrains1, constrains2)
-            }
-            2 => {
-                let constrains1 = ArrayStruct([
-                    split_dims[0].array_struct_idx_grid,
-                    split_dims[0].array_struct_idx_block,
-                    split_dims[0].num_per_block as ArrayType,
-                    shape[split_dims[0].choose_idx] as ArrayType,
-                ]);
-                let constrains2 = ArrayStruct([
-                    split_dims[1].array_struct_idx_grid,
-                    split_dims[1].array_struct_idx_block,
-                    split_dims[1].num_per_block as ArrayType,
-                    shape[split_dims[1].choose_idx] as ArrayType,
-                ]);
-                (constrains1, constrains2)
-            }
-            _ => {
-                unreachable!()
-            }
+        let filter_split_dims = split_dims
+            .iter()
+            .filter(|split_dim| split_dim.dim_len % split_dim.num_per_block != 0)
+            .collect::<Vec<_>>();
+
+        let constrain_num = filter_split_dims.len();
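Reviewer note: only splits with a remainder need a guard, which is exactly what the filter above computes; an evenly divisible split covers its dimension with no out-of-range tail. Illustrative check:

```rust
fn main() {
    for (dim_len, num_per_block) in [(16usize, 8usize), (17, 8)] {
        let num_per_grid = dim_len.div_ceil(num_per_block);
        let covered = num_per_grid * num_per_block;
        let needs_constraint = dim_len % num_per_block != 0;
        // (16, 8): covers exactly 16, no guard needed;
        // (17, 8): covers 24, so indices 17..24 must be skipped by a constraint
        assert_eq!(needs_constraint, covered > dim_len);
    }
}
```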
 
+        // Prepare the kernel
+        let key = SchemeKey {
+            unit_size: unit,
+            constrain_num,
+            block_array_size: block_len.len(),
+            grid_array_size: grid_len.len(),
         };
 
-        //----------------------------------------------------------------------
-        let name = CString::new(NAME).unwrap();
+        let mut schemes = self.schemes.lock().unwrap();
 
-        let grid = shape
-            .iter()
-            .zip(block_dim_choose.iter())
-            .filter_map(|(len, is_choose)| {
-                if !*is_choose {
-                    Some(*len as ArrayType)
-                } else {
-                    None
-                }
-            })
-            .product::<ArrayType>() as u32;
-        let block = block_size as u32;
+        let scheme = schemes.get_or_insert(key, || Scheme::new(key, &self.handle));
+
+        // Compute grid and block sizes
+        let grid = grid_len.iter().product::<ArrayType>() as u32;
+        let block = block_len.iter().product::<ArrayType>() as u32;
 
-        let unit = unit as usize;
+        // Prepare CUDA kernel parameters
+        let block_len_total = block_len.iter().map(|x| *x as u32).product::<u32>();
+        let src_block_stride = ArrayStruct::new(src_block_stride, 0);
+        let dst_block_stride = ArrayStruct::new(dst_block_stride, 0);
+        let src_grid_stride = ArrayStruct::new(src_grid_stride, 0);
+        let dst_grid_stride = ArrayStruct::new(dst_grid_stride, 0);
+        let block_len = ArrayStruct::new(block_len, 1);
+        let grid_len = ArrayStruct::new(grid_len, 1);
+
+        let constrains = match filter_split_dims.len() {
+            0 => ArrayStruct(vec![0; 8]),
+            1 => ArrayStruct(vec![
+                filter_split_dims[0].array_struct_idx_grid,
+                filter_split_dims[0].array_struct_idx_block,
+                filter_split_dims[0].num_per_block as ArrayType,
+                filter_split_dims[0].dim_len as ArrayType,
+                0,
+                0,
+                0,
+                0,
+            ]),
+            2 => ArrayStruct(vec![
+                filter_split_dims[0].array_struct_idx_grid,
+                filter_split_dims[0].array_struct_idx_block,
+                filter_split_dims[0].num_per_block as ArrayType,
+                filter_split_dims[0].dim_len as ArrayType,
+                filter_split_dims[1].array_struct_idx_grid,
+                filter_split_dims[1].array_struct_idx_block,
+                filter_split_dims[1].num_per_block as ArrayType,
+                filter_split_dims[1].dim_len as ArrayType,
+            ]),
+            _ => unreachable!(),
+        };
 
-        let params = params![
+        let params = cuda::params![
             args.dst_base,
             args.src_base,
             block_dim,
             block_len_total,
-            constrain1,
-            constrain2,
-            block_len,        // length of each dimension
-            src_block_stride, // src strides per dimension (bytes)
-            dst_block_stride, // dst strides per dimension (bytes)
-            grid_len,         // length of each dimension
-            src_grid_stride,  // src strides per dimension (bytes)
-            dst_grid_stride,  // dst strides per dimension (bytes)
-            unit              // bytes_per_thread
+            block_len.try_into_array::<DEFAULT_ARRAY_SIZE>()?,        // length of each dimension
+            src_block_stride.try_into_array::<DEFAULT_ARRAY_SIZE>()?, // src strides per dimension (bytes)
+            dst_block_stride.try_into_array::<DEFAULT_ARRAY_SIZE>()?, // dst strides per dimension (bytes)
+            grid_len.try_into_array::<DEFAULT_ARRAY_SIZE>()?,         // length of each dimension
+            src_grid_stride.try_into_array::<DEFAULT_ARRAY_SIZE>()?,  // src strides per dimension (bytes)
+            dst_grid_stride.try_into_array::<DEFAULT_ARRAY_SIZE>()?,  // dst strides per dimension (bytes)
+            constrains.try_into_array::<CONSTRAIN_ARRAY_SIZE>()?
         ];
-        self.module.launch(
-            &name,
+
+        scheme.module.launch(
+            &scheme.name,
             (grid, block, 0),
             &params.to_ptrs(),
             queue_alloc.queue(),
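Reviewer note: the `params![]` call relies on `try_into_array` zero-extending short vectors to the fixed array sizes the kernel ABI declares. A standalone model of that behaviour, assuming a simplified `String` error in place of the real `LaunchError` from `rank_not_support`:

```rust
fn try_into_array<const N: usize>(v: Vec<i32>) -> Result<[i32; N], String> {
    if v.len() > N {
        return Err("rank not supported".to_string());
    }
    // copy the payload, leave the tail as zeros
    let mut a = [0i32; N];
    a[..v.len()].copy_from_slice(&v);
    Ok(a)
}

fn main() {
    // one split dimension: 4 constraint words padded to CONSTRAIN_ARRAY_SIZE = 8
    assert_eq!(
        try_into_array::<8>(vec![2, 1, 8, 17]).unwrap(),
        [2, 1, 8, 17, 0, 0, 0, 0]
    );
    assert!(try_into_array::<3>(vec![1, 2, 3, 4]).is_err());
}
```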
@@ -338,63 +402,119 @@
     }
 }
 
-fn format_code() -> String {
+fn kernel_name(
+    SchemeKey {
+        unit_size,
+        block_array_size,
+        grid_array_size,
+        constrain_num,
+    }: SchemeKey,
+) -> String {
+    let tmem_type = match unit_size {
+        1 => "uchar1",
+        2 => "uchar2",
+        4 => "float1",
+        8 => "float2",
+        16 => "float4",
+        32 => "double4",
+        _ => unreachable!(),
+    };
     format!(
-        r#"#define ARRAY_SIZE {ARRAY_SIZE}
-#define ARRAY_TYPE int
-{CODE}
+        "rearrange_unit_{tmem_type}_block_{block_array_size}_grid_{grid_array_size}_constrain_{constrain_num}"
+    )
+}
 
-extern "C" __global__ void {NAME}(
-    void *__restrict__ dst,
+fn format_code(
+    SchemeKey {
+        unit_size,
+        block_array_size,
+        grid_array_size,
+        constrain_num,
+    }: SchemeKey,
+) -> String {
+    assert!(block_array_size != 0);
+
+    let kernel_name = kernel_name(SchemeKey {
+        unit_size,
+        block_array_size,
+        grid_array_size,
+        constrain_num,
+    });
+    // handle the grid_array_size == 0 case
+    let grid_array_size = max(grid_array_size, 1);
+
+    let mut code = String::new();
+
+    let tmem_type = match unit_size {
+        1 => "uchar1",
+        2 => "uchar2",
+        4 => "float1",
+        8 => "float2",
+        16 => "float4",
+        32 => "double4",
+        _ => unreachable!(),
+    };
+
+    // Emit the header defines
+    code.push_str(&format!("#define BLOCK_ARRAY_SIZE {block_array_size}\n"));
+    code.push_str(&format!("#define GRID_ARRAY_SIZE {grid_array_size}\n"));
+    code.push_str("#define ARRAY_TYPE int\n");
+    code.push_str(&format!("#define CONSTRAIN_NUM {constrain_num}\n"));
+    code.push_str(CODE);
+    code.push('\n');
+
+    // Emit the extern "C" instantiation wrapper
+    code.push_str(&format!(
+        r#"
+extern "C" __global__ void {kernel_name}(
+    void *__restrict__ dst,
     void const *__restrict__ src,
-    const int block_dim,                          // number of block dimensions
-    const int block_len_total,                    // product of all block_len elements
-    const ArrayStruct<4, ARRAY_TYPE> constrains1, // constraint 1 for the split dims
-    const ArrayStruct<4, ARRAY_TYPE> constrains2, // constraint 2 for the split dims
-    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> block_len,        // length of each dimension
-    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> src_block_stride, // src strides per dimension (bytes)
-    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> dst_block_stride, // dst strides per dimension (bytes)
-    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> grid_len,         // length of each dimension
-    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> src_grid_stride,  // src strides per dimension (bytes)
-    const ArrayStruct<ARRAY_SIZE, ARRAY_TYPE> dst_grid_stride,  // dst strides per dimension (bytes)
-    unsigned int const unit_size                  // bytes per element
-){{
-    switch (unit_size) {{
-    case 1:
-        rearrange_1<uchar1>(dst, src, block_dim, block_len_total, constrains1, constrains2,
-                            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
-        break;
-    case 2:
-        rearrange_1<uchar2>(dst, src, block_dim, block_len_total, constrains1, constrains2,
-                            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
-        break;
-    case 4:
-        rearrange_1<float1>(dst, src, block_dim, block_len_total, constrains1, constrains2,
-                            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
-        break;
-    case 8:
-        rearrange_1<float2>(dst, src, block_dim, block_len_total, constrains1, constrains2,
-                            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
-        break;
-    case 16:
-        rearrange_1<float4>(dst, src, block_dim, block_len_total, constrains1, constrains2,
-                            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
-        break;
-    case 32:
-        rearrange_1<double4>(dst, src, block_dim, block_len_total, constrains1, constrains2,
-                            block_len, src_block_stride, dst_block_stride, grid_len, src_grid_stride, dst_grid_stride, unit_size);
-        break;
-    }}
+    unsigned int const block_dim,
+    unsigned int const block_len_total,
+    const ArrayStruct<BLOCK_ARRAY_SIZE, ARRAY_TYPE> block_len,
+    const ArrayStruct<BLOCK_ARRAY_SIZE, ARRAY_TYPE> src_block_stride,
+    const ArrayStruct<BLOCK_ARRAY_SIZE, ARRAY_TYPE> dst_block_stride,
+    const ArrayStruct<GRID_ARRAY_SIZE, ARRAY_TYPE> grid_len,
+    const ArrayStruct<GRID_ARRAY_SIZE, ARRAY_TYPE> src_grid_stride,
+    const ArrayStruct<GRID_ARRAY_SIZE, ARRAY_TYPE> dst_grid_stride
+#if CONSTRAIN_NUM > 0
+    ,const ArrayStruct<CONSTRAIN_NUM, Constrains<ARRAY_TYPE>> constrains
+#endif
+) {{
+    rearrange_kernel<{tmem_type}, {constrain_num}>(
+        dst, src, block_dim, block_len_total,
+        block_len, src_block_stride, dst_block_stride,
+        grid_len, src_grid_stride, dst_grid_stride
+#if CONSTRAIN_NUM > 0
+        ,constrains
+#endif
+    );
 }}
 "#
-    )
+    ));
+    code.push('\n');
+
+    code
 }
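Reviewer note: the generated name doubles as a human-readable form of the cache key. A standalone re-derivation of the scheme (mirrors `kernel_name` above; not imported from the crate):

```rust
fn kernel_name(unit_size: usize, block: usize, grid: usize, constrain: usize) -> String {
    let tmem_type = match unit_size {
        1 => "uchar1",
        2 => "uchar2",
        4 => "float1",
        8 => "float2",
        16 => "float4",
        32 => "double4",
        _ => unreachable!(),
    };
    format!("rearrange_unit_{tmem_type}_block_{block}_grid_{grid}_constrain_{constrain}")
}

fn main() {
    assert_eq!(
        kernel_name(16, 3, 2, 1),
        "rearrange_unit_float4_block_3_grid_2_constrain_1"
    );
}
```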
&op.handle)); + } + } + + // 打印所有编译好的kernel信息 + gpu.apply(|ctx| { + let schemes = op.schemes.lock().unwrap(); + for (key, scheme) in schemes.iter() { + println!("{:?}", scheme.name); + println!( + "unit_size: {}, constrain_num: {}\n{}", + key.unit_size, + key.constrain_num, + // scheme.name.to_str().unwrap(), + scheme.module.load(&scheme.name, ctx).info() + ); + println!("----------------------------------------"); + } + }); + } + + fn copute_with_check( + gpu: &Gpu, + shape: [usize; N], + ) -> Duration { + assert!(TRANS_N <= N, "TRANS_N must be less than or equal to N"); use super::super::common_cpu::Operator as RefOp; use crate::common_cpu::{Cpu, ThisThread}; use cuda::memcpy_d2h; use ndarray_layout::{ArrayLayout, Endian::BigEndian}; use rand::Rng; - // use crate::rearrange::cuda::format_code; - // let code = format_code(); - // std::fs::write("rearrange.cu", code).unwrap(); - let Some(gpu) = Gpu::init() else { - return; - }; let dt = ty::U64; let cpu_op = RefOp::new(&Cpu); let gpu_op = Operator::new(&gpu); - const N: usize = 5; - const TRANS_N: usize = 3; - let shape: [usize; N] = [2232, 3, 7, 9, 4]; let mut r_shape: [usize; N] = shape.clone(); r_shape[0..TRANS_N].reverse(); @@ -445,16 +608,16 @@ mod test { rand::rng().fill(&mut src[..]); let ele = dt.nbytes(); - let s_src = ArrayLayout::<3>::new_contiguous(&shape, BigEndian, ele); + let s_src = ArrayLayout::::new_contiguous(&shape, BigEndian, ele); let s_dst = - ArrayLayout::<3>::new_contiguous(&r_shape, BigEndian, ele).transpose(&trans_param); + ArrayLayout::::new_contiguous(&r_shape, BigEndian, ele).transpose(&trans_param); - println!("s_src shape: {:?}", s_src.shape()); - println!("s_dst shape: {:?}", s_dst.shape()); - println!("s_src strides: {:?}", s_src.strides()); - println!("s_dst strides: {:?}", s_dst.strides()); + debug!("s_src shape: {:?}", s_src.shape()); + debug!("s_dst shape: {:?}", s_dst.shape()); + debug!("s_src strides: {:?}", s_src.strides()); + debug!("s_dst strides: {:?}", s_dst.strides()); - let dst_ans = gpu.apply(|ctx| { + let (dst_ans, time) = gpu.apply(|ctx| { let stream = ctx.stream(); #[cfg(use_nvidia)] let rt = &stream; @@ -503,11 +666,10 @@ mod test { let end_event = stream.record(); end_event.synchronize(); let time = end_event.elapse_from(&start_event); - println!("time: {time:?}"); let mut host = vec![0u64; shape.iter().product::()]; memcpy_d2h(&mut host, &dst); - host + (host, time) }); let mut dst_ref = vec![0u64; shape.iter().product::()]; @@ -526,5 +688,32 @@ mod test { ) .unwrap(); assert_eq!(dst_ans, dst_ref); + time + } + + #[test] + fn test_compute() { + let Some(gpu) = Gpu::init() else { + return; + }; + let shape = [2]; + let time = copute_with_check::<1, 1>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [13]; + let time = copute_with_check::<1, 1>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [16, 2, 16]; + let time = copute_with_check::<3, 3>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [32, 2, 17]; + let time = copute_with_check::<3, 3>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [32, 2, 17, 2, 13]; + let time = copute_with_check::<5, 5>(&gpu, shape); + println!("time: {time:?}"); } } diff --git a/operators/src/rearrange/cuda/rearrange.cuh b/operators/src/rearrange/cuda/rearrange.cuh index e4e7583..e542d39 100644 --- a/operators/src/rearrange/cuda/rearrange.cuh +++ b/operators/src/rearrange/cuda/rearrange.cuh @@ -1,25 +1,48 @@ -template + +#ifndef BLOCK_ARRAY_SIZE +#define BLOCK_ARRAY_SIZE 5 +#endif + +#ifndef GRID_ARRAY_SIZE +#define 
diff --git a/operators/src/rearrange/cuda/rearrange.cuh b/operators/src/rearrange/cuda/rearrange.cuh
index e4e7583..e542d39 100644
--- a/operators/src/rearrange/cuda/rearrange.cuh
+++ b/operators/src/rearrange/cuda/rearrange.cuh
@@ -1,25 +1,48 @@
-template <int ArrSize, class ArrayType>
+
+#ifndef BLOCK_ARRAY_SIZE
+#define BLOCK_ARRAY_SIZE 5
+#endif
+
+#ifndef GRID_ARRAY_SIZE
+#define GRID_ARRAY_SIZE 5
+#endif
+
+#ifndef ARRAY_TYPE
+#define ARRAY_TYPE int
+#endif
+
+template <int ArrSize, class ArrayType>
 struct ArrayStruct {
     ArrayType a[ArrSize];
 };
 
-template <class Tmem, int ArrSize, class ArrayType>
-static __device__ void rearrange_1(
+// Fields: [grid_idx, block_idx, ratio of the grid stride to the block stride, total length limit]
+template <class ElementType>
+struct Constrains {
+    ElementType grid_idx;
+    ElementType block_idx;
+    ElementType grid_div_block;
+    ElementType total_len;
+};
+
+// Main rearrange kernel template
+template <class Tmem, int ConstrainNum>
+__forceinline__ __device__ void rearrange_kernel(
     void *__restrict__ dst,
     void const *__restrict__ src,
     unsigned int const block_dim,
-    unsigned int const block_len_total,                      // product of all block_len elements
-    const ArrayStruct<4, ArrayType> constrains1,             // constraint 1 for the split dims; fields: [grid_idx, block_idx, ratio of grid stride to block stride, total length limit]
-    const ArrayStruct<4, ArrayType> constrains2,             // constraint 2 for the split dims
-    const ArrayStruct<ArrSize, ArrayType> block_len,         // length of each dimension
-    const ArrayStruct<ArrSize, ArrayType> src_block_stride,  // src strides per dimension (bytes)
-    const ArrayStruct<ArrSize, ArrayType> dst_block_stride,  // dst strides per dimension (bytes)
-    const ArrayStruct<ArrSize, ArrayType> grid_len,          // length of each dimension
-    const ArrayStruct<ArrSize, ArrayType> src_grid_stride,   // src strides per dimension (bytes)
-    const ArrayStruct<ArrSize, ArrayType> dst_grid_stride,   // dst strides per dimension (bytes)
-    unsigned int const unit_size                             // bytes per element
+    unsigned int const block_len_total,                               // product of all block_len elements
+    const ArrayStruct<BLOCK_ARRAY_SIZE, ARRAY_TYPE> block_len,        // length of each dimension
+    const ArrayStruct<BLOCK_ARRAY_SIZE, ARRAY_TYPE> src_block_stride, // src strides per dimension (bytes)
+    const ArrayStruct<BLOCK_ARRAY_SIZE, ARRAY_TYPE> dst_block_stride, // dst strides per dimension (bytes)
+    const ArrayStruct<GRID_ARRAY_SIZE, ARRAY_TYPE> grid_len,          // length of each dimension
+    const ArrayStruct<GRID_ARRAY_SIZE, ARRAY_TYPE> src_grid_stride,   // src strides per dimension (bytes)
+    const ArrayStruct<GRID_ARRAY_SIZE, ARRAY_TYPE> dst_grid_stride    // dst strides per dimension (bytes)
+#if CONSTRAIN_NUM > 0
+    ,
+    const ArrayStruct<CONSTRAIN_NUM, Constrains<ARRAY_TYPE>> constrains // constraints for the split dims
+#endif
 ) {
-
     int remaining = threadIdx.x;
     if (remaining >= block_len_total) {
         return;
@@ -28,33 +51,41 @@ static __device__ void rearrange_1(
     // Shared memory declarations
     __shared__ int shared_src_offset;
     __shared__ int shared_dst_offset;
+#if CONSTRAIN_NUM > 0
+    __shared__ int shared_constrains_grid_idx_multiple[CONSTRAIN_NUM];
+#endif
 
-    __shared__ int shared_constrains1_grid_idx_multiple;
-    __shared__ int shared_constrains2_grid_idx_multiple;
-
-    if (threadIdx.x == 0) { // only thread 0 computes
+    if (threadIdx.x == 0) {// only thread 0 computes
         // Compute this block's base offsets (bytes) into src and dst
         int src_offset = 0;
         int dst_offset = 0;
+#if CONSTRAIN_NUM > 0
+        int constrains_grid_idx_multiple[CONSTRAIN_NUM] = {0};
+#endif
         int remaining = blockIdx.x;
-#pragma unroll
-        for (int i = ArrSize - 1; i >= 0; i--) {
+
+        for (int i = GRID_ARRAY_SIZE - 1; i >= 0; i--) {
             int idx = remaining % grid_len.a[i];
             remaining /= grid_len.a[i];
             src_offset += idx * src_grid_stride.a[i];
             dst_offset += idx * dst_grid_stride.a[i];
-
-            if (i == constrains1.a[0]) {
-                shared_constrains1_grid_idx_multiple = idx * constrains1.a[2];
-            }
-            if (i == constrains2.a[0]) {
-                shared_constrains2_grid_idx_multiple = idx * constrains2.a[2];
+#if CONSTRAIN_NUM > 0
+            for (int j = 0; j < CONSTRAIN_NUM; j++) {
+                if (i == constrains.a[j].grid_idx) {
+                    constrains_grid_idx_multiple[j] = idx * constrains.a[j].grid_div_block;
+                }
             }
+#endif
+        }
 
-            // store the results to shared memory
-            shared_src_offset = src_offset;
-            shared_dst_offset = dst_offset;
+        // Store the results to shared memory
+        shared_src_offset = src_offset;
+        shared_dst_offset = dst_offset;
+#if CONSTRAIN_NUM > 0
+        for (int j = 0; j < CONSTRAIN_NUM; j++) {
+            shared_constrains_grid_idx_multiple[j] = constrains_grid_idx_multiple[j];
         }
+#endif
     }
 
     // Make the shared values visible to all threads
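Reviewer note: the thread-0 loop above is a mixed-radix decomposition of `blockIdx.x` into one coordinate per grid dimension. A host-side model with illustrative values:

```rust
fn main() {
    let grid_len = [3usize, 4, 5]; // 60 blocks in total
    let mut remaining = 53;        // an example blockIdx.x
    let mut idx = [0usize; 3];
    // peel off the fastest-varying (last) dimension first, as the kernel does
    for i in (0..grid_len.len()).rev() {
        idx[i] = remaining % grid_len[i];
        remaining /= grid_len[i];
    }
    assert_eq!(idx, [2, 2, 3]); // 2*(4*5) + 2*5 + 3 == 53
    // src/dst base offsets then accumulate idx[i] * {src,dst}_grid_stride.a[i]
}
```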
@@ -63,60 +94,44 @@
     // All threads use the precomputed offset values
     int src_offset = shared_src_offset;
     int dst_offset = shared_dst_offset;
-
-    int constrains1_grid_idx_multiple = shared_constrains1_grid_idx_multiple;
-    int constrains2_grid_idx_multiple = shared_constrains2_grid_idx_multiple;
-
-    for (int i = ArrSize - 1; i > 0; i--) {
-        if (block_len.a[i] > 1) {
-            int idx = remaining % block_len.a[i];
-            remaining /= block_len.a[i];
-            // accumulate offsets
-            src_offset += idx * src_block_stride.a[i];
-            dst_offset += idx * dst_block_stride.a[i];
-
-            if (constrains1.a[3] != 0 && i == constrains1.a[1]) {
-                if (constrains1_grid_idx_multiple + idx >= constrains1.a[3]) {
-                    return;
-                }
-            }
-
-            if (constrains2.a[3] != 0 && i == constrains2.a[1]) {
-                if (constrains2_grid_idx_multiple + idx >= constrains2.a[3]) {
+#if CONSTRAIN_NUM > 0
+    int constrains_grid_idx_multiple[CONSTRAIN_NUM];
+    for (int j = 0; j < CONSTRAIN_NUM; j++) {
+        constrains_grid_idx_multiple[j] = shared_constrains_grid_idx_multiple[j];
+    }
+#endif
+
+    for (int i = BLOCK_ARRAY_SIZE - 1; i > 0; i--) {
+
+        int idx = remaining % block_len.a[i];
+        remaining /= block_len.a[i];
+        // accumulate offsets
+        src_offset += idx * src_block_stride.a[i];
+        dst_offset += idx * dst_block_stride.a[i];
+#if CONSTRAIN_NUM > 0
+        for (int j = 0; j < CONSTRAIN_NUM; j++) {
+            if (i == constrains.a[j].block_idx) {
+                if (constrains_grid_idx_multiple[j] + idx >= constrains.a[j].total_len) {
                     return;
                 }
             }
         }
+#endif
     }
 
-    // handle dimension 0 separately
-    if (remaining >= block_len.a[0]) {
-        return;
-    }
     src_offset += remaining * src_block_stride.a[0];
     dst_offset += remaining * dst_block_stride.a[0];
-
-    if (constrains1.a[3] != 0 && 0 == constrains1.a[1]) {
-        if (constrains1_grid_idx_multiple + remaining >= constrains1.a[3]) {
-            return;
-        }
-    }
-
-    if (constrains2.a[3] != 0 && 0 == constrains2.a[1]) {
-        if (constrains2_grid_idx_multiple + remaining >= constrains2.a[3]) {
-            return;
+#if CONSTRAIN_NUM > 0
+    for (int j = 0; j < CONSTRAIN_NUM; j++) {
+        if (0 == constrains.a[j].block_idx) {
+            if (constrains_grid_idx_multiple[j] + remaining >= constrains.a[j].total_len) {
+                return;
+            }
         }
     }
+#endif
 
     // Perform the copy; the offsets are already byte offsets
-    const int elements_per_thread = unit_size / sizeof(Tmem);
-    if (elements_per_thread == 1) {
-        *reinterpret_cast<Tmem *>(reinterpret_cast<char *>(dst) + dst_offset) =
-            *reinterpret_cast<Tmem const *>(reinterpret_cast<char const *>(src) + src_offset);
-    } else {
-        for (int i = 0; i < elements_per_thread; i++) {
-            reinterpret_cast<Tmem *>(reinterpret_cast<char *>(dst) + dst_offset)[i] =
-                reinterpret_cast<Tmem const *>(reinterpret_cast<char const *>(src) + src_offset)[i];
-        }
-    }
+    *reinterpret_cast<Tmem *>(reinterpret_cast<char *>(dst) + dst_offset) =
+        *reinterpret_cast<Tmem const *>(reinterpret_cast<char const *>(src) + src_offset);
 }
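Reviewer note: the `Constrains` check above discards the padding threads created by an uneven split. A host-side model of the guard, with illustrative values:

```rust
// Mirrors the kernel's test: constrains_grid_idx_multiple[j] + idx < total_len,
// where the grid part is grid_idx * grid_div_block.
fn survives_guard(grid_idx: i32, block_idx: i32, grid_div_block: i32, total_len: i32) -> bool {
    grid_idx * grid_div_block + block_idx < total_len
}

fn main() {
    // length 17 split as 3 grid steps x 8 per block: the last step covers 16..24,
    // so only its first logical element (index 16) is real work
    assert!(survives_guard(2, 0, 8, 17));
    assert!(!survives_guard(2, 1, 8, 17));
}
```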