diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9c1c554b..815126ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -52,6 +52,7 @@ jobs: run: cargo clippy --all-features + --all-targets --message-format=json | clippy-sarif | tee rust-clippy-results.sarif | sarif-fmt continue-on-error: true diff --git a/Cargo.toml b/Cargo.toml index 32e3f815..e8105970 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = ["operators"] -resolver = "2" +resolver = "3" +package.edition = "2024" [workspace.dependencies] clrt = { git = "https://github.com/InfiniTensor/clrt", rev = "984ac7a" } @@ -11,8 +12,8 @@ infini-op = { git = "https://github.com/InfiniTensor/infini-toolkit", rev = "e83 infini-ccl = { git = "https://github.com/InfiniTensor/infini-toolkit", rev = "e8362c3" } search-infini-tools = { git = "https://github.com/InfiniTensor/infini-toolkit", rev = "e8362c3" } -cuda = { git = "https://github.com/YdrMaster/cuda-driver", rev = "f3ffbcc" } -cublas = { git = "https://github.com/YdrMaster/cuda-driver", rev = "f3ffbcc" } -nccl = { git = "https://github.com/YdrMaster/cuda-driver", rev = "f3ffbcc" } -search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "f3ffbcc" } -search-corex-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "f3ffbcc" } +cuda = { git = "https://github.com/YdrMaster/cuda-driver", rev = "c2b12d3" } +cublas = { git = "https://github.com/YdrMaster/cuda-driver", rev = "c2b12d3" } +nccl = { git = "https://github.com/YdrMaster/cuda-driver", rev = "c2b12d3" } +search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "c2b12d3" } +search-corex-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "c2b12d3" } diff --git a/operators/Cargo.toml b/operators/Cargo.toml index 7b65d5a9..2c9c55e2 100644 --- a/operators/Cargo.toml +++ b/operators/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "operators" version = "0.0.0" -edition = "2021" +edition.workspace = true authors = ["YdrMaster "] [features] @@ -13,13 +13,13 @@ nvidia-gpu = ["cuda", "cublas", "nccl", "fslock", "libloading"] iluvatar-gpu = ["cuda", "cublas", "fslock", "libloading"] [dependencies] -digit-layout = "0.2" -ndarray-layout = "0.1" +digit-layout = "0.3" +ndarray-layout = "0.2" rayon = "1.10" -lru = "0.12" +lru = "0.14" num-traits = "0.2" itertools = "0.14" -half = "2.4" +half = "2.6" log = "0.4" gemm = { version = "0.18", optional = true } diff --git a/operators/build.rs b/operators/build.rs index c5c199f8..d9e32b49 100644 --- a/operators/build.rs +++ b/operators/build.rs @@ -26,18 +26,21 @@ fn main() { { infini.define() } - let use_nvidia = cfg!(feature = "nvidia-gpu") && find_cuda_root().is_some(); + + // iluvatar let use_iluvatar = cfg!(feature = "iluvatar-gpu") && find_corex().is_some(); + if use_iluvatar { + iluvatar.define(); + cuda.define(); + return; + } + + let use_nvidia = cfg!(feature = "nvidia-gpu") && find_cuda_root().is_some(); if use_nvidia { nvidia.define(); if find_nccl_root().is_some() { nccl.define() } - } - if use_iluvatar { - iluvatar.define() - } - if use_nvidia || use_iluvatar { - cuda.define() + cuda.define(); } } diff --git a/operators/src/.clang-format b/operators/src/.clang-format index 66c6e431..91ec77ad 100644 --- a/operators/src/.clang-format +++ b/operators/src/.clang-format @@ -1,21 +1,13 @@ -# Generated from CLion C/C++ Code Style settings +--- BasedOnStyle: LLVM -AccessModifierOffset: -4 -AlignAfterOpenBracket: Align -# AlignConsecutiveAssignments: None -AlignOperands: Align -AllowAllArgumentsOnNextLine: false -AllowAllConstructorInitializersOnNextLine: false -AllowAllParametersOfDeclarationOnNextLine: false -AllowShortBlocksOnASingleLine: Always -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Always -AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterReturnType: None -AlwaysBreakTemplateDeclarations: No -BreakBeforeBraces: Custom +IndentWidth: 4 # 缩进宽度,LLVM 默认值为 2,改为 4 +AccessModifierOffset: -4 # public/protected/private 访问控制符相对成员的偏移,与 IndentWidth 配合,LLVM 默认值为 -2 +AlignOperands: AlignAfterOperator # 双目运算符的行间对齐,LLVM 默认值为 Align,改为带符号一起换行 +ColumnLimit: 0 # 列宽限制,LLVM 默认值为 80,改为不限制 +AllowShortBlocksOnASingleLine: Always # 是否允许短块(单个语句的块)不换行,LLVM 默认值为 Never,改为允许 +AllowShortLoopsOnASingleLine: true # 是否允许短循环不换行,LLVM 默认值为 false,改为允许 +InsertBraces: true # 是否在 if/for/while/switch 等语句后插入大括号,LLVM 默认值为 false,改为允许 +BreakBeforeBraces: Custom # 大括号换行配置,LLVM 默认值为 LLVM,改为自定义以使 BraceWrapping 生效 BraceWrapping: AfterCaseLabel: false AfterClass: false @@ -23,44 +15,15 @@ BraceWrapping: AfterEnum: false AfterFunction: false AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false AfterUnion: false + AfterExternBlock: false BeforeCatch: false BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false IndentBraces: false - SplitEmptyFunction: false + SplitEmptyFunction: true SplitEmptyRecord: true -BreakBeforeBinaryOperators: None -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: BeforeColon -BreakInheritanceList: BeforeColon -ColumnLimit: 0 -CompactNamespaces: true -ContinuationIndentWidth: 4 -IndentCaseLabels: true -IndentPPDirectives: None -IndentWidth: 4 -KeepEmptyLinesAtTheStartOfBlocks: true -MaxEmptyLinesToKeep: 2 -NamespaceIndentation: All -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PointerAlignment: Right -ReflowComments: false -SpaceAfterCStyleCast: true -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: false -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 0 -SpacesInAngles: false -SpacesInCStyleCastParentheses: false -SpacesInContainerLiterals: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -TabWidth: 4 -UseTab: Never + SplitEmptyNamespace: true diff --git a/operators/src/add/args.rs b/operators/src/add/args.rs index ca2c1294..2adc29f0 100644 --- a/operators/src/add/args.rs +++ b/operators/src/add/args.rs @@ -1,6 +1,6 @@ use crate::{ - get_static, rank_mismatch, shape_mismatch, shape_not_support, utils::type_distinct, ConstPtr, - Hardware, MutPtr, SchemeError, TensorLayout, + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, rank_mismatch, shape_mismatch, + shape_not_support, utils::type_distinct, }; use digit_layout::DigitLayout; use itertools::izip; @@ -40,7 +40,7 @@ impl Args { pub(super) struct Scheme(DigitLayout, Box<[isize]>); impl Scheme { - pub fn new(args: &Args) -> Result { + pub fn new(args: &Args) -> Result { let Args { c_layout: c, a_layout: a, @@ -48,7 +48,7 @@ impl Scheme { .. } = args; // # 检查基本属性 - let dt = type_distinct(&[c.dt(), a.dt(), b.dt()])?; + let dt = type_distinct(&[c.dt, a.dt, b.dt])?; let ndim = c.ndim(); if a.ndim() != ndim || b.ndim() != ndim { return Err(rank_mismatch(format!( @@ -68,17 +68,13 @@ impl Scheme { } let mut dims = Vec::with_capacity(ndim); for (&d, &da, &db, &sc, &sa, &sb) in izip!( - c.shape(), - a.shape(), - b.shape(), + c.shape_group(), + a.shape_group(), + b.shape_group(), c.strides(), a.strides(), - b.strides() + b.strides(), ) { - get_static! { - d da db - sc sa sb - } if da != d || db != d { return Err(shape_mismatch(format!( "c: {:?}, a: {:?}, b: {:?}", diff --git a/operators/src/add/common_cpu/mod.rs b/operators/src/add/common_cpu/mod.rs index 5f04865a..c3a8871d 100644 --- a/operators/src/add/common_cpu/mod.rs +++ b/operators/src/add/common_cpu/mod.rs @@ -1,5 +1,5 @@ -use super::{args::Scheme, Add, Args}; -use crate::{common_cpu::Cpu, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Add, Args, args::Scheme}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu}; use digit_layout::types as ty; use half::f16; use rayon::iter::{IntoParallelIterator, ParallelIterator}; @@ -17,14 +17,6 @@ impl crate::Operator for Operator { fn new(_node: &Self::TopoNode) -> Self { Self } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } fn launch( &self, diff --git a/operators/src/add/cuda/add.cuh b/operators/src/add/cuda/add.cuh index 663ad267..957fb678 100644 --- a/operators/src/add/cuda/add.cuh +++ b/operators/src/add/cuda/add.cuh @@ -1,4 +1,4 @@ -template +template static __device__ void _add( Tdata *__restrict__ c, Tdata const *__restrict__ a, diff --git a/operators/src/add/cuda/mod.rs b/operators/src/add/cuda/mod.rs index 376bf6fe..18fd3b64 100644 --- a/operators/src/add/cuda/mod.rs +++ b/operators/src/add/cuda/mod.rs @@ -1,14 +1,15 @@ -use super::{args::Scheme, Add, Args}; +use super::{Add, Args, args::Scheme}; use crate::{ - cuda::{dt_name, Gpu, Handle, ModuleBox}, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity, + cuda::{Gpu, Handle, ModuleBox, dt_name}, shape_not_support, strides_not_support, - utils::{gcd, type_distinct}, - ByteOf, LaunchError, QueueAlloc, SchemeDiversity, SchemeError, + utils::gcd, }; +use cuda::params; use digit_layout::DigitLayout; use lru::LruCache; use std::{ - ffi::{c_uint, CString}, + ffi::c_uint, sync::{Arc, Mutex}, }; @@ -32,20 +33,6 @@ impl crate::Operator for Operator { } } - #[inline] - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let dt = type_distinct(&[args.c_layout.dt(), args.a_layout.dt(), args.b_layout.dt()])?; - self.schemes - .lock() - .unwrap() - .get_or_insert(dt, || compile(&self.handle, dt)); - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -60,20 +47,20 @@ impl crate::Operator for Operator { let count = scheme.count(); let &[1] = scheme.idx_strides() else { - return Err(shape_not_support("").into()); + return Err(shape_not_support("")); }; let &[sc] = scheme.c_strides() else { - return Err(shape_not_support("").into()); + return Err(shape_not_support("")); }; let &[sa] = scheme.a_strides() else { - return Err(shape_not_support("").into()); + return Err(shape_not_support("")); }; let &[sb] = scheme.b_strides() else { - return Err(shape_not_support("").into()); + return Err(shape_not_support("")); }; let unit = dt.nbytes() as isize; if sc != unit || sa != unit || sb != unit { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); } let block_dims = gcd(count, self.max_threads_block); @@ -84,18 +71,15 @@ impl crate::Operator for Operator { b_base, .. } = args; - let params = cuda::params![c_base, a_base, b_base]; self.schemes .lock() .unwrap() .get_or_insert(dt, || compile(&self.handle, dt)) .launch( - CString::new("add").unwrap(), - grid_dims as c_uint, - block_dims as c_uint, - params.as_ptr(), - 0, + c"add", + (grid_dims as c_uint, block_dims as c_uint, 0), + ¶ms![*c_base, *a_base, *b_base].to_ptrs(), queue_alloc.queue(), ); Ok(()) @@ -124,25 +108,12 @@ extern "C" __global__ void add( #[cfg(test)] mod test { use super::{Args, Gpu, Operator}; - use crate::{dyn_, Hardware, Operator as _, TensorLayout}; + use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F16, F64}, DigitLayout, + types::{F16, F64}, }; - use std::ptr::null; - fn dyn_args(dt: DigitLayout) -> Args { - use std::ptr::null_mut; - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]); - Args { - c_layout: layout.clone(), - c_base: null_mut(), - a_layout: layout.clone(), - a_base: null(), - b_layout: layout.clone(), - b_base: null(), - } - } fn args( dt: DigitLayout, n: usize, @@ -178,10 +149,8 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - cpu_op.scheme(&dyn_args(F64), 0).unwrap(); - gpu_op.scheme(&dyn_args(F16), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); let n = 1; let d = 768; diff --git a/operators/src/add/infini/mod.rs b/operators/src/add/infini/mod.rs deleted file mode 100644 index a780e260..00000000 --- a/operators/src/add/infini/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use super::{Add, Args}; -use crate::{infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError}; - -pub struct Operator; - -impl Add for Operator {} - -impl crate::Operator for Operator { - type Hardware = Device; - type TopoNode = Device; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/add/mod.rs b/operators/src/add/mod.rs index 00e72b7b..e3d9f920 100644 --- a/operators/src/add/mod.rs +++ b/operators/src/add/mod.rs @@ -4,10 +4,6 @@ pub mod common_cpu; #[cfg(use_cuda)] pub mod cuda; -#[cfg(use_infini)] -pub mod infini; -#[cfg(use_cl)] -pub mod opencl; mod args; pub use args::Args; diff --git a/operators/src/add/opencl/mod.rs b/operators/src/add/opencl/mod.rs deleted file mode 100644 index 95abcd0e..00000000 --- a/operators/src/add/opencl/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use super::{Add, Args}; -use crate::{opencl::ClDevice, ByteOf, LaunchError, QueueAlloc, SchemeError}; - -pub struct Operator; - -impl Add for Operator {} - -impl crate::Operator for Operator { - type Hardware = ClDevice; - type TopoNode = ClDevice; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/add_rows/args.rs b/operators/src/add_rows/args.rs index b5ed0690..f2950c7e 100644 --- a/operators/src/add_rows/args.rs +++ b/operators/src/add_rows/args.rs @@ -1,7 +1,6 @@ -use crate::{ - type_not_support, +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, type_not_support, utils::{dim_distinct, rank_error, type_distinct}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::{DigitLayout, LayoutContent::Unsigned}; use std::ptr::{null, null_mut}; @@ -37,14 +36,14 @@ impl Args { pub(super) struct Meta { pub dt: DigitLayout, pub dt_idx: DigitLayout, - pub batch: MaybeDyn, - pub m: MaybeDyn, - pub n: MaybeDyn, - pub k: MaybeDyn, + pub batch: usize, + pub m: usize, + pub n: usize, + pub k: usize, } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { dst_layout: dst, src_layout: src, @@ -52,30 +51,30 @@ impl Args { .. } = self; - let dt = type_distinct(&[dst.dt(), src.dt()])?; - let dt_idx = idx.dt(); + let dt = type_distinct(&[dst.dt, src.dt])?; + let dt_idx = idx.dt; if !matches!(dt_idx.decode(), Unsigned { .. }) { return Err(type_not_support(format!( "data type {dt_idx} is not supported, must be unsigned integers" ))); } - let &[batch, m, n] = dst.shape() else { + let &[batch, m, n] = &*dst.shape() else { return Err(rank_error("dst", 3, dst.ndim())); }; - let &[k, n_] = src.shape() else { + let &[k, n_] = &*src.shape() else { return Err(rank_error("src", 2, src.ndim())); }; - let &[batch_, m_] = idx.shape() else { + let &[batch_, m_] = &*idx.shape() else { return Err(rank_error("idx", 2, idx.ndim())); }; Ok(Meta { dt, dt_idx, - batch: dim_distinct(&[batch, batch_])?, - m: dim_distinct(&[m, m_])?, - n: dim_distinct(&[n, n_])?, + batch: dim_distinct(&[batch, batch_]).expect("batch mismatch"), + m: dim_distinct(&[m, m_]).expect("m mismatch"), + n: dim_distinct(&[n, n_]).expect("n mismatch"), k, }) } diff --git a/operators/src/add_rows/common_cpu/mod.rs b/operators/src/add_rows/common_cpu/mod.rs index 68fadcf4..7ee12200 100644 --- a/operators/src/add_rows/common_cpu/mod.rs +++ b/operators/src/add_rows/common_cpu/mod.rs @@ -1,5 +1,5 @@ -use super::{args::Meta, AddRows, Args}; -use crate::{common_cpu::Cpu, get_static, ByteOf, LaunchError, QueueAlloc, SchemeError, Unsigned}; +use super::{AddRows, Args, args::Meta}; +use crate::{ByteOf, LaunchError, QueueAlloc, Unsigned, common_cpu::Cpu}; use digit_layout::types as ty; use half::f16; use rayon::iter::{IntoParallelIterator, ParallelIterator}; @@ -18,14 +18,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -62,12 +54,6 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - b m n k - bsd msd nsd - bsi msi nss kss - } - let dst = *dst_base as usize; let src = *src_base as usize; let idx = *idx_base as usize; diff --git a/operators/src/add_rows/cuda/add_rows.cuh b/operators/src/add_rows/cuda/add_rows.cuh index b7c17ad9..1e5449ac 100644 --- a/operators/src/add_rows/cuda/add_rows.cuh +++ b/operators/src/add_rows/cuda/add_rows.cuh @@ -1,4 +1,4 @@ -template +template static __device__ void add_rows( Tdata *__restrict__ dst, Tdata const *__restrict__ src, diff --git a/operators/src/add_rows/cuda/mod.rs b/operators/src/add_rows/cuda/mod.rs index d3a944fc..ab1dd394 100644 --- a/operators/src/add_rows/cuda/mod.rs +++ b/operators/src/add_rows/cuda/mod.rs @@ -1,11 +1,12 @@ use super::{AddRows, Args}; use crate::{ + ByteOf, LaunchError, QueueAlloc, SchemeDiversity, add_rows::args::Meta, - cuda::{dt_name, Gpu, Handle, ModuleBox}, - get_static, strides_not_support, + cuda::{Gpu, Handle, ModuleBox, dt_name}, + strides_not_support, utils::gcd, - ByteOf, LaunchError, QueueAlloc, SchemeDiversity, SchemeError, }; +use cuda::params; use digit_layout::DigitLayout; use lru::LruCache; use std::{ @@ -34,22 +35,6 @@ impl crate::Operator for Operator { } } - #[inline] - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt, .. } = args.meta()?; - - let key = SchemeKey { dt }; - self.schemes - .lock() - .unwrap() - .try_get_or_insert(key, || Scheme::new(&self.handle, key))?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -79,15 +64,10 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - b n m - bsd msd nsd - bsi msi nss kss - } - let unit_dst = dst_layout.dt().nbytes() as isize; - let unit_idx = idx_layout.dt().nbytes() as isize; + let unit_dst = dst_layout.dt.nbytes() as isize; + let unit_idx = idx_layout.dt.nbytes() as isize; if nsd != unit_dst || nss != unit_dst || msi != unit_idx { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; fn cast(strides: &[isize], size: usize) -> Vec { strides.iter().map(|x| x / size as isize).collect() @@ -98,12 +78,10 @@ impl crate::Operator for Operator { let &[bsi] = cast(&[bsi], unit_idx as usize).as_slice() else { todo!() }; - let params = cuda::params![dst_base, src_base, idx_base, bsd, msd, kss, bsi]; + let params = params![*dst_base, *src_base, *idx_base, bsd, msd, kss, bsi]; let block = gcd(self.max_threads_block, n); let dimx = n.div_ceil(block); - let key = SchemeKey { - dt: dst_layout.dt(), - }; + let key = SchemeKey { dt: dst_layout.dt }; let scheme = self .schemes .lock() @@ -112,10 +90,8 @@ impl crate::Operator for Operator { .clone(); scheme.module.launch( &scheme.name, - (b as _, m as _, dimx as _), - block as u32, - params.as_ptr(), - 0, + ((b as _, m as _, dimx as _), block as u32, 0), + ¶ms.to_ptrs(), queue_alloc.queue(), ); Ok(()) @@ -134,7 +110,7 @@ struct SchemeKey { } impl Scheme { - pub fn new(handle: &Arc, SchemeKey { dt }: SchemeKey) -> Result { + pub fn new(handle: &Arc, SchemeKey { dt }: SchemeKey) -> Result { let device = handle.device(); let cc = device.compute_capability(); let type_name = dt_name(dt); @@ -168,26 +144,15 @@ extern "C" __global__ void {name}( #[cfg(test)] mod test { use super::{Args, Gpu, Operator}; - use crate::{cuda::cast_load, dyn_, Hardware, Operator as _, TensorLayout}; + use crate::{Hardware, Operator as _, TensorLayout, cuda::cast_load}; use cuda::memcpy_d2h; use digit_layout::{ - types::{F16, F64, U32}, DigitLayout, + types::{F16, F64, U32}, }; use half::f16; - use std::ptr::null; - fn dyn_args(dt: DigitLayout) -> Args { - use std::ptr::null_mut; - Args { - dst_layout: TensorLayout::new_dyn(dt, &[dyn_(); 3], &[dyn_(); 3]), - dst_base: null_mut(), - src_layout: TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]), - src_base: null(), - idx_layout: TensorLayout::new_dyn(U32, &[dyn_(); 2], &[dyn_(); 2]), - idx_base: null(), - } - } + #[allow(clippy::too_many_arguments)] fn args( dt: DigitLayout, b: usize, @@ -220,10 +185,8 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - cpu_op.scheme(&dyn_args(F64), 0).unwrap(); - gpu_op.scheme(&dyn_args(F16), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); let b = 1; let m = 10; diff --git a/operators/src/add_rows/infini/mod.rs b/operators/src/add_rows/infini/mod.rs deleted file mode 100644 index 380153a3..00000000 --- a/operators/src/add_rows/infini/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use super::{AddRows, Args}; -use crate::{infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError}; - -pub struct Operator; - -impl AddRows for Operator {} - -impl crate::Operator for Operator { - type Hardware = Device; - type TopoNode = Device; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/add_rows/mod.rs b/operators/src/add_rows/mod.rs index 2608f660..cdf4b8b6 100644 --- a/operators/src/add_rows/mod.rs +++ b/operators/src/add_rows/mod.rs @@ -4,10 +4,6 @@ pub mod common_cpu; #[cfg(use_cuda)] pub mod cuda; -#[cfg(use_infini)] -pub mod infini; -#[cfg(use_cl)] -pub mod opencl; mod args; pub use args::Args; diff --git a/operators/src/add_rows/opencl/mod.rs b/operators/src/add_rows/opencl/mod.rs deleted file mode 100644 index 10a3ac22..00000000 --- a/operators/src/add_rows/opencl/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use super::{AddRows, Args}; -use crate::{opencl::ClDevice, ByteOf, LaunchError, QueueAlloc, SchemeError}; - -pub struct Operator; - -impl AddRows for Operator {} - -impl crate::Operator for Operator { - type Hardware = ClDevice; - type TopoNode = ClDevice; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/all_reduce/args.rs b/operators/src/all_reduce/args.rs index 4b5bb7b0..f56d2294 100644 --- a/operators/src/all_reduce/args.rs +++ b/operators/src/all_reduce/args.rs @@ -1,10 +1,8 @@ -use super::ReduceOp; +use super::ReduceOp; use crate::{ - dyn_not_support, rearrange, shape_mismatch, strides_not_support, utils::type_distinct, - Hardware, MaybeDyn, SchemeError, + Hardware, LaunchError, rearrange, shape_mismatch, strides_not_support, utils::type_distinct, }; use digit_layout::DigitLayout; -use ndarray_layout::ArrayLayout; pub struct Args { pub pair: rearrange::Args, @@ -24,7 +22,7 @@ pub(super) struct Meta { } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { pair: rearrange::Args { @@ -35,15 +33,9 @@ impl Args { .. } = self; - let dt = type_distinct(&[dst_layout.dt(), src_layout.dt()])?; + let dt = type_distinct(&[dst_layout.dt, src_layout.dt])?; - let Some(shape) = MaybeDyn::get_all(dst_layout.shape()) else { - return Err(dyn_not_support("")); - }; - let Some(strides) = MaybeDyn::get_all(dst_layout.strides()) else { - return Err(dyn_not_support("")); - }; - let dst = ArrayLayout::<2>::new(shape, strides, 0); + let dst = &dst_layout.layout; let &[dst] = dst .merge_be(0, dst.ndim()) .ok_or(strides_not_support(""))? @@ -52,13 +44,7 @@ impl Args { unreachable!() }; - let Some(shape) = MaybeDyn::get_all(src_layout.shape()) else { - return Err(dyn_not_support("")); - }; - let Some(strides) = MaybeDyn::get_all(src_layout.strides()) else { - return Err(dyn_not_support("")); - }; - let src = ArrayLayout::<2>::new(shape, strides, 0); + let src = &src_layout.layout; let &[src] = src .merge_be(0, src.ndim()) .ok_or(strides_not_support(""))? diff --git a/operators/src/all_reduce/common_cpu.rs b/operators/src/all_reduce/common_cpu.rs index 57606088..a909aa3c 100644 --- a/operators/src/all_reduce/common_cpu.rs +++ b/operators/src/all_reduce/common_cpu.rs @@ -1,8 +1,9 @@ -use super::{args::Meta, AllReduce, Args, ReduceOp}; +use super::{AllReduce, Args, ReduceOp, args::Meta}; use crate::{ + ByteOf, LaunchError, QueueAlloc, TopoNode, broadcast::{self, common_cpu::Operator as Broadcast}, common_cpu::{Cpu, InprocNode}, - rearrange, ByteOf, LaunchError, QueueAlloc, SchemeError, TopoNode, + rearrange, }; use digit_layout::DigitLayout; use half::{bf16, f16}; @@ -32,14 +33,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -146,7 +139,7 @@ fn sum(len: usize, buf: *mut u8, src: *const u8) { #[test] fn test_comm() { - use crate::{common_cpu::ThisThread, Operator as _, TensorLayout}; + use crate::{Operator as _, TensorLayout, common_cpu::ThisThread}; use digit_layout::types::U32; InprocNode::new(4) @@ -174,5 +167,5 @@ fn test_comm() { }) .collect::>() .into_iter() - .for_each(|h| assert_eq!(h.join().unwrap(), [0 + 1 + 2 + 3; 8])); + .for_each(|h| assert_eq!(h.join().unwrap(), [1 + 2 + 3; 8])); } diff --git a/operators/src/all_reduce/infini.rs b/operators/src/all_reduce/infini.rs index 163b45fa..d3d66a6d 100644 --- a/operators/src/all_reduce/infini.rs +++ b/operators/src/all_reduce/infini.rs @@ -1,8 +1,8 @@ -use super::{args::Meta, AllReduce, Args, ReduceOp}; +use super::{AllReduce, Args, ReduceOp, args::Meta}; use crate::{ + ByteOf, LaunchError, QueueAlloc, infini::{Device, InfiniNode}, rearrange::{self, infini::Operator as Rearrange}, - ByteOf, LaunchError, QueueAlloc, SchemeError, }; use digit_layout::types as ty; use infini_ccl::bindings::InfiniDataType_t; @@ -30,14 +30,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, diff --git a/operators/src/all_reduce/nccl.rs b/operators/src/all_reduce/nccl.rs index d66398df..98afa4cd 100644 --- a/operators/src/all_reduce/nccl.rs +++ b/operators/src/all_reduce/nccl.rs @@ -1,7 +1,8 @@ -use super::{args::Meta, AllReduce, Args, ReduceOp}; +use super::{AllReduce, Args, ReduceOp, args::Meta}; use crate::{ + ByteOf, LaunchError, QueueAlloc, cuda::{Gpu, NcclNode}, - rearrange, ByteOf, LaunchError, QueueAlloc, SchemeError, + rearrange, }; use std::{ slice::{from_raw_parts, from_raw_parts_mut}, @@ -25,14 +26,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, diff --git a/operators/src/attention/args.rs b/operators/src/attention/args.rs index 4c1ac43a..ed9732f3 100644 --- a/operators/src/attention/args.rs +++ b/operators/src/attention/args.rs @@ -1,11 +1,9 @@ -use crate::{ - dyn_, +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, fuesd_softmax::AttnMask, utils::{dim_distinct, rank_error, type_distinct}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::DigitLayout; -use std::ptr::{null, null_mut}; pub struct Args { pub q_layout: TensorLayout, @@ -25,39 +23,15 @@ pub struct Args { pub(super) struct Meta { pub dt: DigitLayout, - pub nh: MaybeDyn, - pub nkvh: MaybeDyn, - pub seq: MaybeDyn, - pub att: MaybeDyn, - pub dh: MaybeDyn, + pub nh: usize, + pub nkvh: usize, + pub seq: usize, + pub att: usize, + pub dh: usize, } impl Args { - pub(crate) fn new_null( - mask: AttnMask, - dt: DigitLayout, - nh: MaybeDyn, - nkvh: MaybeDyn, - seq: MaybeDyn, - att: MaybeDyn, - dh: MaybeDyn, - ) -> Self { - let qo_layout = TensorLayout::new_dyn(dt, &[nh, seq, dh], &[dyn_(); 3]); - let kv_layout = TensorLayout::new_dyn(dt, &[nkvh, att, dh], &[dyn_(); 3]); - Self { - q_layout: qo_layout.clone(), - q_base: null_mut(), - k_layout: kv_layout.clone(), - k_base: null(), - v_layout: kv_layout, - v_base: null(), - o_layout: qo_layout, - o_base: null_mut(), - mask, - } - } - - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { q_layout, k_layout, @@ -66,26 +40,26 @@ impl Args { .. } = self; - let &[nh_q, seq_q, dh_q] = q_layout.shape() else { + let &[nh_q, seq_q, dh_q] = &*q_layout.shape() else { return Err(rank_error("q", 3, q_layout.ndim())); }; - let &[nkvh_k, att_k, dh_k] = k_layout.shape() else { + let &[nkvh_k, att_k, dh_k] = &*k_layout.shape() else { return Err(rank_error("k", 3, k_layout.ndim())); }; - let &[nkvh_v, att_v, dh_v] = v_layout.shape() else { + let &[nkvh_v, att_v, dh_v] = &*v_layout.shape() else { return Err(rank_error("v", 3, v_layout.ndim())); }; - let &[nh_o, seq_o, dh_o] = o_layout.shape() else { + let &[nh_o, seq_o, dh_o] = &*o_layout.shape() else { return Err(rank_error("o", 3, o_layout.ndim())); }; Ok(Meta { - dt: type_distinct(&[q_layout.dt(), k_layout.dt(), v_layout.dt(), o_layout.dt()])?, - nh: dim_distinct(&[nh_q, nh_o])?, - nkvh: dim_distinct(&[nkvh_k, nkvh_v])?, - seq: dim_distinct(&[seq_q, seq_o])?, - att: dim_distinct(&[att_k, att_v])?, - dh: dim_distinct(&[dh_q, dh_k, dh_v, dh_o])?, + dt: type_distinct(&[q_layout.dt, k_layout.dt, v_layout.dt, o_layout.dt])?, + nh: dim_distinct(&[nh_q, nh_o]).expect("nh mismatch"), + nkvh: dim_distinct(&[nkvh_k, nkvh_v]).expect("nkvh mismatch"), + seq: dim_distinct(&[seq_q, seq_o]).expect("seq mismatch"), + att: dim_distinct(&[att_k, att_v]).expect("att mismatch"), + dh: dim_distinct(&[dh_q, dh_k, dh_v, dh_o]).expect("dh mismatch"), }) } } diff --git a/operators/src/attention/cuda.rs b/operators/src/attention/cuda.rs index 208561c3..98e02907 100644 --- a/operators/src/attention/cuda.rs +++ b/operators/src/attention/cuda.rs @@ -3,22 +3,10 @@ impl_op!(cuda, Gpu); #[cfg(test)] mod test { use super::{super::Args, Operator}; - use crate::{cuda::Gpu, ByteOf, Hardware, Operator as _, TensorLayout}; - use digit_layout::{types as ty, DigitLayout}; - - fn dyn_args(dt: DigitLayout, nh: usize, seq: usize, att: usize) -> Args { - use crate::dyn_; - Args::new_null( - crate::fuesd_softmax::AttnMask::Causal, - dt, - nh.into(), - dyn_(), - seq.into(), - att.into(), - dyn_(), - ) - } + use crate::{ByteOf, Hardware, Operator as _, TensorLayout, cuda::Gpu}; + use digit_layout::{DigitLayout, types as ty}; + #[allow(clippy::too_many_arguments)] fn args( dt: DigitLayout, nh: usize, @@ -44,18 +32,6 @@ mod test { } } - #[test] - fn test_compile() { - let Some(gpu) = Gpu::init() else { - return; - }; - println!("{}", gpu.0.device().info()); - - let mut op = Operator::new(&gpu); - let workspace = op.scheme(&dyn_args(ty::F16, 32, 7, 127), usize::MAX); - println!("workspace: {workspace:?}"); - } - #[test] fn test_compute() { use super::super::common_cpu::Operator as RefOp; diff --git a/operators/src/attention/operator.rs b/operators/src/attention/operator.rs index f520fb24..d54ca76a 100644 --- a/operators/src/attention/operator.rs +++ b/operators/src/attention/operator.rs @@ -1,7 +1,7 @@ -use super::{args::Meta, Args, Attention}; +use super::{Args, Attention, args::Meta}; use crate::{ - dyn_, fuesd_softmax, get_static, mat_mul, rearrange, ByteOf, Hardware, LaunchError, QueueAlloc, - SchemeError, TensorLayout, Workspace, WorkspaceCollector, + ByteOf, Hardware, LaunchError, QueueAlloc, TensorLayout, Workspace, fuesd_softmax, mat_mul, + rearrange, }; use ndarray_layout::ArrayLayout; use std::marker::PhantomData; @@ -42,97 +42,6 @@ where } } - fn scheme( - &mut self, - args: &Self::Args, - max_workspace_size: usize, - ) -> Result { - let Meta { - dt, - nh, - seq, - att, - dh, - .. - } = args.meta()?; - let Args { - q_layout, - k_layout, - v_layout, - o_layout, - .. - } = args; - - // 如果不能保证 nh seq att dh 已知,用任意值初始化算子 - let (Some(&nh), Some(&seq), Some(&att), Some(&dh)) = ( - nh.get_static(), - seq.get_static(), - att.get_static(), - dh.get_static(), - ) else { - let mut wc = WorkspaceCollector::new(); - - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 3], &[dyn_(); 3]); - wc.push_sub(self.mat_mul.scheme( - &mat_mul::Args::new_null(layout.clone(), 1., layout.clone(), layout, 1.), - max_workspace_size, - )?); - - let layout = TensorLayout::new_dyn(dt, &[nh, seq, att], &[dyn_(); 3]); - wc.push_sub(self.softmax.scheme( - &fuesd_softmax::Args::new_null(args.mask, layout), - max_workspace_size, - )?); - - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 3], &[dyn_(); 3]); - wc.push_sub(self.rearrange.scheme( - &rearrange::Args::new_null(layout.clone(), layout), - max_workspace_size, - )?); - - return Ok(wc.cauculate(max_workspace_size)); - }; - - let ele = dt.nbytes(); - let att_layout = TensorLayout::new_contiguous(dt, &[nh, seq, att]); - let q_size = nh * seq * dh * ele; - let att_size = nh * seq * att * ele; - let workspace_size = max_workspace_size.saturating_sub(q_size + att_size); - - let mut wc = WorkspaceCollector::new(); - wc.push_base(q_size); - wc.push_base(att_size); - - // att = q . k^T - wc.push_sub(self.mat_mul.scheme( - &mat_mul::Args::new_null( - att_layout.clone(), - 0., - q_layout.clone(), - k_layout.clone(), - 1., - ), - workspace_size, - )?); - // att = softmax(att) - wc.push_sub(self.softmax.scheme( - &fuesd_softmax::Args::new_null(args.mask, att_layout.clone()), - workspace_size, - )?); - // q = att . v - wc.push_sub(self.mat_mul.scheme( - &mat_mul::Args::new_null(q_layout.clone(), 0., att_layout, v_layout.clone(), 1.), - workspace_size, - )?); - // o = rearrange(q) - wc.push_sub(self.rearrange.scheme( - &rearrange::Args::new_null(o_layout.clone(), q_layout.clone()), - workspace_size, - )?); - - Ok(wc.cauculate(max_workspace_size)) - } - fn launch( &self, args: &Self::Args, @@ -170,12 +79,6 @@ where }; let ele = dt.nbytes(); - get_static! { - nh seq dh - nh_sq seq_sq dh_sq - nkvh att - nkvh_sk att_sk dh_sk - }; #[inline(always)] fn layout(shape: [usize; 3], strides: [isize; 3]) -> ArrayLayout<3> { diff --git a/operators/src/attention_kv_cached/args.rs b/operators/src/attention_kv_cached/args.rs index 78518ef5..95a31a5b 100644 --- a/operators/src/attention_kv_cached/args.rs +++ b/operators/src/attention_kv_cached/args.rs @@ -1,7 +1,7 @@ -use crate::{ +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, fuesd_softmax::AttnMask, utils::{dim_distinct, rank_error, type_distinct}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::DigitLayout; @@ -25,16 +25,15 @@ pub struct Args { pub v_cache_base: MutPtr, pub mask: AttnMask, - pub pos: MaybeDyn, + pub pos: usize, } pub(super) struct Meta { pub dt: DigitLayout, - pub nh: MaybeDyn, - pub nkvh: MaybeDyn, - pub dh: MaybeDyn, + pub nkvh: usize, + pub dh: usize, - pub seq: MaybeDyn, + pub seq: usize, } impl Args { @@ -47,7 +46,7 @@ impl Args { k_cache_layout: TensorLayout, v_cache_layout: TensorLayout, mask: AttnMask, - pos: MaybeDyn, + pos: usize, ) -> Self { use std::ptr::{null, null_mut}; Self { @@ -68,7 +67,7 @@ impl Args { } } - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { q_layout, k_layout, @@ -79,38 +78,38 @@ impl Args { .. } = self; - let &[nh_q, seq_q, dh_q] = q_layout.shape() else { + let &[nh_q, seq_q, dh_q] = &*q_layout.shape() else { return Err(rank_error("q", 3, q_layout.ndim())); }; - let &[nkvh_k, seq_k, dh_k] = k_layout.shape() else { + let &[nkvh_k, seq_k, dh_k] = &*k_layout.shape() else { return Err(rank_error("k", 3, k_layout.ndim())); }; - let &[nkvh_v, seq_v, dh_v] = v_layout.shape() else { + let &[nkvh_v, seq_v, dh_v] = &*v_layout.shape() else { return Err(rank_error("v", 3, v_layout.ndim())); }; - let &[nh_o, seq_o, dh_o] = o_layout.shape() else { + let &[nh_o, seq_o, dh_o] = &*o_layout.shape() else { return Err(rank_error("o", 3, o_layout.ndim())); }; - let &[nkvh_kc, _buf, dh_kc] = k_cache_layout.shape() else { + let &[nkvh_kc, _buf, dh_kc] = &*k_cache_layout.shape() else { return Err(rank_error("k_cache", 3, k_cache_layout.ndim())); }; - let &[nkvh_vc, _buf, dh_vc] = v_cache_layout.shape() else { + let &[nkvh_vc, _buf, dh_vc] = &*v_cache_layout.shape() else { return Err(rank_error("v_cache", 3, v_cache_layout.ndim())); }; + let _nh = dim_distinct(&[nh_q, nh_o]); Ok(Meta { dt: type_distinct(&[ - q_layout.dt(), - k_layout.dt(), - v_layout.dt(), - o_layout.dt(), - k_cache_layout.dt(), - v_cache_layout.dt(), + q_layout.dt, + k_layout.dt, + v_layout.dt, + o_layout.dt, + k_cache_layout.dt, + v_cache_layout.dt, ])?, - nh: dim_distinct(&[nh_q, nh_o])?, - nkvh: dim_distinct(&[nkvh_k, nkvh_v, nkvh_kc, nkvh_vc])?, - dh: dim_distinct(&[dh_q, dh_k, dh_v, dh_o, dh_kc, dh_vc])?, - seq: dim_distinct(&[seq_q, seq_k, seq_v, seq_o])?, + nkvh: dim_distinct(&[nkvh_k, nkvh_v, nkvh_kc, nkvh_vc]).expect("nkvh mismatch"), + dh: dim_distinct(&[dh_q, dh_k, dh_v, dh_o, dh_kc, dh_vc]).expect("dh mismatch"), + seq: dim_distinct(&[seq_q, seq_k, seq_v, seq_o]).expect("seq mismatch"), }) } } diff --git a/operators/src/attention_kv_cached/cuda.rs b/operators/src/attention_kv_cached/cuda.rs index c5c605ae..f683e7d7 100644 --- a/operators/src/attention_kv_cached/cuda.rs +++ b/operators/src/attention_kv_cached/cuda.rs @@ -3,29 +3,10 @@ impl_op!(cuda, Gpu); #[cfg(test)] mod test { use super::{super::Args, Operator}; - use crate::{cuda::Gpu, ByteOf, Hardware, Operator as _, TensorLayout}; - use digit_layout::{types as ty, DigitLayout}; - - fn dyn_args( - dt: DigitLayout, - nh: usize, - seq: usize, - dh: usize, - pos: usize, - ) -> Args { - use crate::dyn_; - Args::new_null( - TensorLayout::new_dyn(dt, &[nh.into(), seq.into(), dh.into()], &[dyn_(); 3]), - TensorLayout::new_dyn(dt, &[dyn_(), seq.into(), dh.into()], &[dyn_(); 3]), - TensorLayout::new_dyn(dt, &[dyn_(), seq.into(), dh.into()], &[dyn_(); 3]), - TensorLayout::new_dyn(dt, &[nh.into(), seq.into(), dh.into()], &[dyn_(); 3]), - TensorLayout::new_dyn(dt, &[nh.into(), seq.into(), dh.into()], &[dyn_(); 3]), - TensorLayout::new_dyn(dt, &[nh.into(), seq.into(), dh.into()], &[dyn_(); 3]), - crate::fuesd_softmax::AttnMask::Causal, - pos.into(), - ) - } + use crate::{ByteOf, Hardware, Operator as _, TensorLayout, cuda::Gpu}; + use digit_layout::{DigitLayout, types as ty}; + #[allow(clippy::too_many_arguments)] fn args( dt: DigitLayout, nh: usize, @@ -54,22 +35,10 @@ mod test { k_cache_base, v_cache_base, mask: crate::fuesd_softmax::AttnMask::Causal, - pos: pos.into(), + pos, } } - #[test] - fn test_compile() { - let Some(gpu) = Gpu::init() else { - return; - }; - println!("{}", gpu.0.device().info()); - - let mut op = Operator::new(&gpu); - let workspace = op.scheme(&dyn_args(ty::F16, 32, 7, 64, 13), usize::MAX); - println!("workspace: {workspace:?}"); - } - #[test] fn test_compute() { use super::super::common_cpu::Operator as RefOp; diff --git a/operators/src/attention_kv_cached/operator.rs b/operators/src/attention_kv_cached/operator.rs index 81345f4c..520dd400 100644 --- a/operators/src/attention_kv_cached/operator.rs +++ b/operators/src/attention_kv_cached/operator.rs @@ -1,7 +1,6 @@ -use super::{args::Meta, Args, AttnKVCached}; +use super::{Args, AttnKVCached, args::Meta}; use crate::{ - attention, dyn_, get_static, rearrange, shape_mismatch, ByteOf, Hardware, LaunchError, - MaybeDyn, QueueAlloc, TensorLayout, WorkspaceCollector, + ByteOf, Hardware, LaunchError, QueueAlloc, TensorLayout, attention, rearrange, shape_mismatch, }; use ndarray_layout::ArrayLayout; use std::marker::PhantomData; @@ -38,41 +37,6 @@ where } } - fn scheme( - &mut self, - args: &Self::Args, - max_workspace_size: usize, - ) -> Result { - let Meta { - dt, - nh, - nkvh, - dh, - seq, - } = args.meta()?; - - let mut wc = WorkspaceCollector::new(); - - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 3], &[dyn_(); 3]); - wc.push_sub(self.rearrange.scheme( - &rearrange::Args::new_null(layout.clone(), layout), - max_workspace_size, - )?); - - let att = if let (Some(&seq), Some(&pos)) = (seq.get_static(), args.pos.get_static()) { - MaybeDyn::from(pos + seq) - } else { - dyn_() - }; - - wc.push_sub(self.attention.scheme( - &attention::Args::new_null(args.mask, dt, nh, nkvh, seq, att, dh), - max_workspace_size, - )?); - - Ok(wc.cauculate(max_workspace_size)) - } - fn launch( &self, args: &Self::Args, @@ -85,27 +49,27 @@ where let Meta { dt, nkvh, dh, seq, .. } = args.meta()?; - let Args { - q_layout, + let &Args { + ref q_layout, q_base, - k_layout, + ref k_layout, k_base, - v_layout, + ref v_layout, v_base, - o_layout, + ref o_layout, o_base, - k_cache_layout, + ref k_cache_layout, k_cache_base, - v_cache_layout, + ref v_cache_layout, v_cache_base, mask, pos, } = args; - let &[_, buf_k, _] = k_cache_layout.shape() else { + let &[_, buf_k, _] = &*k_cache_layout.shape() else { unreachable!() }; - let &[_, buf_v, _] = v_cache_layout.shape() else { + let &[_, buf_v, _] = &*v_cache_layout.shape() else { unreachable!() }; let &[nkvh_skc, buf_skc, dh_skc] = k_cache_layout.strides() else { @@ -115,18 +79,10 @@ where unreachable!() }; - get_static! { - pos seq - buf_k - nkvh buf_v dh - nkvh_skc buf_skc dh_skc - nkvh_svc buf_svc dh_svc - }; - // 检查 cache 容量 let att = pos + seq; if buf_k < att || buf_v < att { - return Err(shape_mismatch("Out of cache buffer").into()); + return Err(shape_mismatch("Out of cache buffer")); } // 连接 kv cache #[inline(always)] @@ -145,7 +101,7 @@ where dst_layout: TensorLayout::new(dt, k_cat.shape(), k_cat.strides()), dst_base: unsafe { k_cache_base.byte_add(k_cat.offset() as _) }, src_layout: k_layout.clone(), - src_base: *k_base, + src_base: k_base, }, workspace, queue_alloc, @@ -155,7 +111,7 @@ where dst_layout: TensorLayout::new(dt, v_cat.shape(), v_cat.strides()), dst_base: unsafe { v_cache_base.byte_add(k_cat.offset() as _) }, src_layout: v_layout.clone(), - src_base: *v_base, + src_base: v_base, }, workspace, queue_alloc, @@ -167,15 +123,15 @@ where assert_eq!(v_layout.offset(), 0); self.attention.launch( &attention::Args { - mask: *mask, + mask, q_layout: q_layout.clone(), - q_base: *q_base, + q_base, k_layout: TensorLayout::new(dt, k_layout.shape(), k_layout.strides()), - k_base: *k_cache_base, + k_base: k_cache_base, v_layout: TensorLayout::new(dt, v_layout.shape(), v_layout.strides()), - v_base: *v_cache_base, + v_base: v_cache_base, o_layout: o_layout.clone(), - o_base: *o_base, + o_base, }, workspace, queue_alloc, diff --git a/operators/src/broadcast/args.rs b/operators/src/broadcast/args.rs index 0fb24387..d0904a89 100644 --- a/operators/src/broadcast/args.rs +++ b/operators/src/broadcast/args.rs @@ -1,8 +1,6 @@ -use crate::{ - dyn_not_support, rearrange, shape_mismatch, strides_not_support, utils::type_distinct, - Hardware, MaybeDyn, SchemeError, +use crate::{ + Hardware, LaunchError, rearrange, shape_mismatch, strides_not_support, utils::type_distinct, }; -use ndarray_layout::ArrayLayout; pub struct Args { pub pair: rearrange::Args, @@ -21,7 +19,7 @@ pub(super) struct Meta { } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { pair: rearrange::Args { @@ -32,15 +30,9 @@ impl Args { .. } = self; - let dt = type_distinct(&[dst_layout.dt(), src_layout.dt()])?; + let dt = type_distinct(&[dst_layout.dt, src_layout.dt])?; - let Some(shape) = MaybeDyn::get_all(dst_layout.shape()) else { - return Err(dyn_not_support("")); - }; - let Some(strides) = MaybeDyn::get_all(dst_layout.strides()) else { - return Err(dyn_not_support("")); - }; - let dst = ArrayLayout::<2>::new(shape, strides, 0); + let dst = &dst_layout.layout; let &[dst] = dst .merge_be(0, dst.ndim()) .ok_or(strides_not_support(""))? @@ -49,13 +41,7 @@ impl Args { unreachable!() }; - let Some(shape) = MaybeDyn::get_all(src_layout.shape()) else { - return Err(dyn_not_support("")); - }; - let Some(strides) = MaybeDyn::get_all(src_layout.strides()) else { - return Err(dyn_not_support("")); - }; - let src = ArrayLayout::<2>::new(shape, strides, 0); + let src = &src_layout.layout; let &[src] = src .merge_be(0, src.ndim()) .ok_or(strides_not_support(""))? diff --git a/operators/src/broadcast/common_cpu/mod.rs b/operators/src/broadcast/common_cpu/mod.rs index b4ac034b..f169e0dd 100644 --- a/operators/src/broadcast/common_cpu/mod.rs +++ b/operators/src/broadcast/common_cpu/mod.rs @@ -1,7 +1,8 @@ -use super::{args::Meta, Args, Broadcast}; +use super::{Args, Broadcast, args::Meta}; use crate::{ + ByteOf, LaunchError, QueueAlloc, TopoNode, common_cpu::{Cpu, InprocNode}, - rearrange, ByteOf, LaunchError, QueueAlloc, SchemeError, TopoNode, + rearrange, }; use std::ptr::{addr_eq, copy, copy_nonoverlapping}; @@ -19,14 +20,6 @@ impl crate::Operator for Operator { Self(node.clone()) } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -71,7 +64,7 @@ impl crate::Operator for Operator { #[test] fn test_comm() { - use crate::{common_cpu::ThisThread, Operator as _, TensorLayout}; + use crate::{Operator as _, TensorLayout, common_cpu::ThisThread}; use digit_layout::types::U32; InprocNode::new(4) diff --git a/operators/src/broadcast/nccl/mod.rs b/operators/src/broadcast/nccl/mod.rs index 498432d2..fdcb12ea 100644 --- a/operators/src/broadcast/nccl/mod.rs +++ b/operators/src/broadcast/nccl/mod.rs @@ -1,7 +1,8 @@ -use super::{args::Meta, Args, Broadcast}; +use super::{Args, Broadcast, args::Meta}; use crate::{ + ByteOf, LaunchError, QueueAlloc, cuda::{Gpu, NcclNode}, - rearrange, ByteOf, LaunchError, QueueAlloc, SchemeError, + rearrange, }; use std::{ slice::{from_raw_parts, from_raw_parts_mut}, @@ -25,14 +26,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, diff --git a/operators/src/common/blob.rs b/operators/src/common/blob.rs index 94ae0dd3..fb4bdad7 100644 --- a/operators/src/common/blob.rs +++ b/operators/src/common/blob.rs @@ -1,5 +1,5 @@ -use std::{ - alloc::{alloc, dealloc, Layout}, +use std::{ + alloc::{Layout, alloc, dealloc}, ops::{Deref, DerefMut}, ptr::NonNull, slice::{from_raw_parts, from_raw_parts_mut}, diff --git a/operators/src/common/error.rs b/operators/src/common/error.rs index 57f1fd5d..1584fca3 100644 --- a/operators/src/common/error.rs +++ b/operators/src/common/error.rs @@ -1,5 +1,5 @@ #[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum SchemeErrorKind { +pub enum LaunchErrorKind { TypeNotSupport, TypeMismatch, RankNotSupport, @@ -9,17 +9,6 @@ pub enum SchemeErrorKind { StridesNotSupport, ArgsNotSupport, DynamicNotSupport, -} - -#[derive(Clone, Debug)] -pub struct SchemeError { - pub kind: SchemeErrorKind, - pub info: String, -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum LaunchErrorKind { - Scheme(SchemeErrorKind), ExecutionFailed, } @@ -29,17 +18,8 @@ pub struct LaunchError { pub info: String, } -impl From for LaunchError { - fn from(SchemeError { kind, info }: SchemeError) -> Self { - Self { - kind: LaunchErrorKind::Scheme(kind), - info, - } - } -} - pub(super) mod functions { - use super::{LaunchError, LaunchErrorKind::*, SchemeError, SchemeErrorKind::*}; + use super::{LaunchError, LaunchErrorKind::*}; macro_rules! builder { ($ty:ident: $name:ident $kind:expr) => { @@ -53,15 +33,15 @@ pub(super) mod functions { }; } - builder!(SchemeError: type_not_support TypeNotSupport ); - builder!(SchemeError: type_mismatch TypeMismatch ); - builder!(SchemeError: rank_mismatch RankMismatch ); - builder!(SchemeError: rank_not_support RankNotSupport ); - builder!(SchemeError: shape_not_support ShapeNotSupport ); - builder!(SchemeError: shape_mismatch ShapeMismatch ); - builder!(SchemeError: strides_not_support StridesNotSupport); - builder!(SchemeError: args_not_support ArgsNotSupport ); - builder!(SchemeError: dyn_not_support DynamicNotSupport); + builder!(LaunchError: type_not_support TypeNotSupport ); + builder!(LaunchError: type_mismatch TypeMismatch ); + builder!(LaunchError: rank_mismatch RankMismatch ); + builder!(LaunchError: rank_not_support RankNotSupport ); + builder!(LaunchError: shape_not_support ShapeNotSupport ); + builder!(LaunchError: shape_mismatch ShapeMismatch ); + builder!(LaunchError: strides_not_support StridesNotSupport); + builder!(LaunchError: args_not_support ArgsNotSupport ); + builder!(LaunchError: dyn_not_support DynamicNotSupport); builder!(LaunchError: execution_failed ExecutionFailed ); } diff --git a/operators/src/common/maybe_dyn.rs b/operators/src/common/maybe_dyn.rs deleted file mode 100644 index 86f3b990..00000000 --- a/operators/src/common/maybe_dyn.rs +++ /dev/null @@ -1,118 +0,0 @@ -pub trait DynVal { - fn default_dyn() -> Self; - fn is_dynamic(&self) -> bool; -} - -impl DynVal for isize { - #[inline] - fn default_dyn() -> Self { - Self::MAX - } - #[inline] - fn is_dynamic(&self) -> bool { - *self == Self::MAX - } -} - -impl DynVal for usize { - #[inline] - fn default_dyn() -> Self { - Self::MAX - } - #[inline] - fn is_dynamic(&self) -> bool { - *self == Self::MAX - } -} - -impl DynVal for f32 { - #[inline] - fn default_dyn() -> Self { - Self::INFINITY - } - #[inline] - fn is_dynamic(&self) -> bool { - self.is_infinite() && self.is_sign_positive() - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -#[repr(transparent)] -pub struct MaybeDyn(pub T); - -impl From for MaybeDyn { - #[inline] - fn from(value: T) -> Self { - Self(value) - } -} - -impl MaybeDyn { - #[inline] - pub fn dynamic() -> Self { - Self(T::default_dyn()) - } - #[inline] - pub fn is_dynamic(&self) -> bool { - self.0.is_dynamic() - } - #[inline] - pub fn get_static(&self) -> Option<&T> { - if !self.is_dynamic() { - Some(&self.0) - } else { - None - } - } -} - -#[inline(always)] -pub fn dyn_() -> MaybeDyn { - MaybeDyn::dynamic() -} - -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -pub enum MergeError { - EmptyIter, - NotMatch, -} - -impl MaybeDyn { - pub fn merge<'a>(iter: impl IntoIterator) -> Result<&'a Self, MergeError> { - let mut iter = iter.into_iter(); - let mut acc = iter.next().ok_or(MergeError::EmptyIter)?; - for it in iter { - if it.is_dynamic() { - // Nothing to do - } else if acc.is_dynamic() { - acc = it; - } else if acc.0 != it.0 { - return Err(MergeError::NotMatch); - } - } - Ok(acc) - } - - pub fn get_all(slice: &[Self]) -> Option<&[T]> { - if slice.iter().any(|arg| arg.is_dynamic()) { - None - } else { - Some(unsafe { std::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) }) - } - } -} - -#[inline] -pub(crate) fn static_from(arg: &MaybeDyn) -> Result<&T, SchemeError> { - arg.get_static().ok_or_else(|| dyn_not_support("")) -} - -macro_rules! get_static { - ($($name:ident)*) => { - $( let $name = *$crate::static_from(&$name)?; )* - }; -} - -pub(crate) use get_static; - -use super::{dyn_not_support, SchemeError}; diff --git a/operators/src/common/mod.rs b/operators/src/common/mod.rs index ade7efdb..ee34ec19 100644 --- a/operators/src/common/mod.rs +++ b/operators/src/common/mod.rs @@ -2,7 +2,6 @@ mod blob; mod calculator; mod diversity; mod error; -mod maybe_dyn; mod pool; mod tensor; mod unsigned; @@ -10,19 +9,16 @@ mod workspace; pub use blob::Blob; pub use calculator::OffsetCalculator; -pub use error::{functions::*, LaunchError, LaunchErrorKind, SchemeError, SchemeErrorKind}; -pub use maybe_dyn::{dyn_, DynVal, MaybeDyn}; +pub use error::{LaunchError, LaunchErrorKind, functions::*}; pub use pool::Pool; pub use tensor::TensorLayout; pub use unsigned::Unsigned; pub use workspace::Workspace; -pub(crate) use diversity::{SchemeCacheSize, SchemeDiversity}; -pub(crate) use maybe_dyn::{get_static, static_from}; -pub(crate) use workspace::WorkspaceCollector; +pub(crate) use diversity::SchemeDiversity; pub mod utils { - use super::{rank_not_support, shape_mismatch, type_mismatch, MaybeDyn, SchemeError}; + use super::{LaunchError, rank_not_support, type_mismatch}; use digit_layout::DigitLayout; #[cfg(any(use_cuda, use_cl))] @@ -37,7 +33,7 @@ pub mod utils { } #[inline] - pub(crate) fn type_distinct(pairs: &[DigitLayout]) -> Result { + pub(crate) fn type_distinct(pairs: &[DigitLayout]) -> Result { let [dt, tail @ ..] = pairs else { unreachable!("pairs empty"); }; @@ -49,15 +45,19 @@ pub mod utils { } #[inline] - pub(crate) fn rank_error(arg: &str, expected: usize, actual: usize) -> SchemeError { + pub(crate) fn rank_error(arg: &str, expected: usize, actual: usize) -> LaunchError { rank_not_support(format!("{arg}.ndim = {actual}, {expected} expected")) } #[inline] - pub(crate) fn dim_distinct(args: &[MaybeDyn]) -> Result, SchemeError> { - MaybeDyn::merge(args) - .copied() - .map_err(|_| shape_mismatch(format!("{args:?} are not distinct"))) + pub(crate) fn dim_distinct(args: &[usize]) -> Option { + let (&ans, others) = args.split_first().unwrap(); + for &x in others { + if x != ans { + return None; + } + } + Some(ans) } } diff --git a/operators/src/common/pool.rs b/operators/src/common/pool.rs index ba82de9f..4b2018e1 100644 --- a/operators/src/common/pool.rs +++ b/operators/src/common/pool.rs @@ -1,5 +1,5 @@ use std::{ - alloc::{alloc, dealloc, Layout}, + alloc::{Layout, alloc, dealloc}, ptr::null_mut, sync::atomic::{ AtomicPtr, diff --git a/operators/src/common/tensor.rs b/operators/src/common/tensor.rs index ca5838cb..2eaa6a03 100644 --- a/operators/src/common/tensor.rs +++ b/operators/src/common/tensor.rs @@ -1,48 +1,18 @@ -use crate::MaybeDyn; -use digit_layout::DigitLayout; +use digit_layout::DigitLayout; use ndarray_layout::ArrayLayout; -use std::{ - alloc::{alloc, dealloc, Layout}, - ptr::{copy_nonoverlapping, NonNull}, - slice::from_raw_parts, -}; +use std::borrow::Cow; -/// | field | type | -/// |:--------:|:-------------:| -/// | dt | DigitLayout | -/// | ndim | u64 | -/// | shape | [usize; ndim] | -/// | strides | [isize; ndim] | -#[repr(transparent)] -pub struct TensorLayout(NonNull); +#[derive(Clone)] +pub struct TensorLayout { + pub dt: DigitLayout, + pub layout: ArrayLayout<4>, +} impl TensorLayout { - pub fn new_dyn( - dt: DigitLayout, - shape: &[MaybeDyn], - strides: &[MaybeDyn], - ) -> Self { - let shape: &[usize] = unsafe { std::mem::transmute(shape) }; - let strides: &[isize] = unsafe { std::mem::transmute(strides) }; - Self::new(dt, shape, strides) - } - pub fn new(dt: DigitLayout, shape: &[usize], strides: &[isize]) -> Self { - assert_eq!(shape.len(), strides.len()); - - unsafe { - let ptr = alloc(Self::layout(shape.len())); - - let cursor: *mut DigitLayout = ptr.cast(); - cursor.write(dt); - let cursor: *mut u64 = cursor.add(1).cast(); - cursor.write(shape.len() as _); - let cursor: *mut usize = cursor.add(1).cast(); - copy_nonoverlapping(shape.as_ptr(), cursor, shape.len()); - let cursor: *mut isize = cursor.add(shape.len()).cast(); - copy_nonoverlapping(strides.as_ptr(), cursor, strides.len()); - - Self(NonNull::new_unchecked(ptr as _)) + Self { + dt, + layout: ArrayLayout::new(shape, strides, 0), } } @@ -60,61 +30,27 @@ impl TensorLayout { Self::new(dt, shape, &strides) } - #[inline] - pub fn from_arr(dt: DigitLayout, arr: &ArrayLayout) -> Self { - Self::new(dt, arr.shape(), arr.strides()) - } - - #[inline] - pub fn dt(&self) -> DigitLayout { - let ptr = self.0.cast(); - unsafe { *ptr.as_ref() } - } - #[inline] pub fn ndim(&self) -> usize { - let ptr = self.0.cast::().as_ptr(); - unsafe { *ptr.add(1) as _ } + self.layout.ndim() } #[inline] - pub fn shape(&self) -> &[MaybeDyn] { - let ptr = self.0.cast::>().as_ptr(); - let len = self.ndim(); - unsafe { from_raw_parts(ptr.add(2), len) } + pub fn shape_group(&self) -> &[usize] { + self.layout.shape() } #[inline] - pub fn strides(&self) -> &[MaybeDyn] { - let ptr = self.0.cast::>().as_ptr(); - let len = self.ndim(); - unsafe { from_raw_parts(ptr.add(2 + len), len) } - } - - #[inline(always)] - fn layout(ndim: usize) -> Layout { - Layout::array::(2 + ndim * 2).unwrap() - } -} - -impl Clone for TensorLayout { - #[inline] - fn clone(&self) -> Self { - let layout = Self::layout(self.ndim()); - let src = self.0.cast::().as_ptr(); - unsafe { - let dst = alloc(layout); - copy_nonoverlapping(src, dst, layout.size()); - Self(NonNull::new_unchecked(dst as _)) + pub fn shape(&self) -> Cow<[usize]> { + if self.dt.group_size() == 1 { + Cow::Borrowed(self.layout.shape()) + } else { + Cow::Owned(vec![]) } } -} -impl Drop for TensorLayout { #[inline] - fn drop(&mut self) { - let ptr = self.0.cast().as_ptr(); - let layout = Self::layout(self.ndim()); - unsafe { dealloc(ptr, layout) } + pub fn strides(&self) -> &[isize] { + self.layout.strides() } } diff --git a/operators/src/common/workspace.rs b/operators/src/common/workspace.rs index 55e284d7..ba3b4f5c 100644 --- a/operators/src/common/workspace.rs +++ b/operators/src/common/workspace.rs @@ -50,41 +50,3 @@ impl Drop for Workspace<'_, QA> { } } } - -pub(crate) struct WorkspaceCollector { - base: Vec, - sub: usize, -} - -impl WorkspaceCollector { - #[inline] - pub fn new() -> Self { - Self { - base: Vec::with_capacity(2), - sub: 0, - } - } - - #[inline] - pub fn push_base(&mut self, base: usize) { - self.base.push(base) - } - - #[inline] - pub fn push_sub(&mut self, sub: usize) { - self.sub = self.sub.max(sub) - } - - pub fn cauculate(mut self, max_workspace_size: usize) -> usize { - self.base.push(self.sub); - let mut ans = 0; - for s in self.base { - if ans + s <= max_workspace_size { - ans += s; - } else { - return ans; - } - } - ans - } -} diff --git a/operators/src/conv/args.rs b/operators/src/conv/args.rs index 0faaba73..f2bb2cc5 100644 --- a/operators/src/conv/args.rs +++ b/operators/src/conv/args.rs @@ -1,6 +1,6 @@ -use crate::{ +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, utils::{dim_distinct, rank_error, type_distinct}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::DigitLayout; @@ -20,19 +20,19 @@ pub struct Args { pub(crate) struct Meta { pub dt: DigitLayout, - pub n: MaybeDyn, - pub m: MaybeDyn, - pub c: MaybeDyn, - pub h: MaybeDyn, - pub w: MaybeDyn, - pub hy: MaybeDyn, - pub wy: MaybeDyn, - pub hk: MaybeDyn, - pub wk: MaybeDyn, + pub n: usize, + pub m: usize, + pub c: usize, + pub h: usize, + pub w: usize, + pub hy: usize, + pub wy: usize, + pub hk: usize, + pub wk: usize, } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { y_layout, x_layout, @@ -41,24 +41,24 @@ impl Args { .. } = self; - let &[ny, my, hy, wy] = y_layout.shape() else { + let &[ny, my, hy, wy] = &*y_layout.shape() else { return Err(rank_error("y", 4, y_layout.ndim())); }; - let &[n, c, h, w] = x_layout.shape() else { + let &[n, c, h, w] = &*x_layout.shape() else { return Err(rank_error("x", 4, x_layout.ndim())); }; - let &[m, ck, hk, wk] = w_layout.shape() else { + let &[m, ck, hk, wk] = &*w_layout.shape() else { return Err(rank_error("w", 4, w_layout.ndim())); }; - let &[mb] = b_layout.shape() else { + let &[mb] = &*b_layout.shape() else { return Err(rank_error("b", 1, b_layout.ndim())); }; Ok(Meta { - dt: type_distinct(&[y_layout.dt(), x_layout.dt(), w_layout.dt(), b_layout.dt()])?, - n: dim_distinct(&[n, ny])?, - m: dim_distinct(&[m, my, mb])?, - c: dim_distinct(&[c, ck])?, + dt: type_distinct(&[y_layout.dt, x_layout.dt, w_layout.dt, b_layout.dt])?, + n: dim_distinct(&[n, ny]).expect("n mismatch"), + m: dim_distinct(&[m, my, mb]).expect("m mismatch"), + c: dim_distinct(&[c, ck]).expect("c mismatch"), h, w, hy, diff --git a/operators/src/conv/im2col.rs b/operators/src/conv/im2col.rs index 0abef071..73f02603 100644 --- a/operators/src/conv/im2col.rs +++ b/operators/src/conv/im2col.rs @@ -1,7 +1,7 @@ -use super::{args::Meta, Args, Conv}; +use super::{Args, Conv, args::Meta}; use crate::{ - args_not_support, get_static, mat_mul, rearrange, strides_not_support, ByteOf, Hardware, - LaunchError, QueueAlloc, SchemeError, TensorLayout, Workspace, + ByteOf, Hardware, LaunchError, QueueAlloc, TensorLayout, Workspace, args_not_support, mat_mul, + rearrange, strides_not_support, }; use ndarray_layout::{ArrayLayout, Endian::BigEndian, MergeArg}; use std::marker::PhantomData; @@ -38,48 +38,6 @@ where } } - fn scheme( - &mut self, - args: &Self::Args, - max_workspace_size: usize, - ) -> Result { - let Args { pads, .. } = args; - let &[0, 0, 0, 0] = pads else { - return Err(args_not_support( - "non-zero padding for im2col is not supported", - )); - }; - let Meta { - dt, - n, - c, - hy, - wy, - hk, - wk, - .. - } = args.meta()?; - - macro_rules! get { - ($( $var:ident )+) => { - $( - let Some(&$var) = $var.get_static() else { - return Ok(0); - }; - )+ - }; - } - - get!(n c hk wk hy wy); - let a_size = [n, c, hk, wk, hy, wy, dt.nbytes()].iter().product(); - - Ok(if a_size <= max_workspace_size { - a_size - } else { - 0 - }) - } - fn launch( &self, args: &Self::Args, @@ -106,7 +64,9 @@ where let &[hs, ws] = strides; let &[hd, wd] = dilations; let &[0, 0, 0, 0] = pads else { - return Err(args_not_support("non-zero padding for im2col is not supported").into()); + return Err(args_not_support( + "non-zero padding for im2col is not supported", + )); }; let Meta { @@ -135,45 +95,37 @@ where unreachable!() }; - get_static! { - n m c h hy hk w wy wk - nys mys hys wys - nxs cxs hxs wxs - mks cks hks wks - mbs - } - // 计算考虑空洞的 kernel size let hkd = (hk - 1) * hd + 1; let wkd = (wk - 1) * wd + 1; if (h - hkd) % hs != 0 || (w - wkd) % ws != 0 { - return Err(strides_not_support("output size not divisible by strides").into()); + return Err(strides_not_support("output size not divisible by strides")); } - type Arr6 = ArrayLayout<6>; + type Arr4 = ArrayLayout<4>; // c <- y: [n, m, hy * wy] // a <- w: [n, m, c * hk * wk] // b <- x: [n, c * hk * wk, hy * wy] // y 作为矩阵乘输出的布局 - let Some(c_y) = Arr6::new(&[n, m, hy, wy], &[nys, mys, hys, wys], 0).merge_be(2, 2) else { - return Err(strides_not_support("").into()); + let Some(c_y) = Arr4::new(&[n, m, hy, wy], &[nys, mys, hys, wys], 0).merge_be(2, 2) else { + return Err(strides_not_support("")); }; // w 作为矩阵乘输入的布局 - let Some(a_w) = Arr6::new(&[n, m, c, hk, wk], &[0, mks, cks, hks, wks], 0).merge_be(2, 3) + let Some(a_w) = Arr4::new(&[n, m, c, hk, wk], &[0, mks, cks, hks, wks], 0).merge_be(2, 3) else { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; // x im2col rearrange let ele = dt.nbytes(); let b_shape = [n, c, hk, wk, hy, wy]; let [hd, wd, hs, ws] = [hd, wd, hs, ws].map(|x| x as isize); let b_strides = [nxs, cxs, hxs * hd, wxs * wd, hxs * hs, wxs * ws]; - let b_dst = Arr6::new_contiguous(&b_shape, BigEndian, ele); - let b_src = Arr6::new(&b_shape, &b_strides, 0); + let b_dst = Arr4::new_contiguous(&b_shape, BigEndian, ele); + let b_src = Arr4::new(&b_shape, &b_strides, 0); let b_x = b_dst .merge_many(&[ MergeArg { @@ -189,14 +141,14 @@ where ]) .unwrap(); - let c_y = TensorLayout::from_arr(dt, &c_y); - let a_w = TensorLayout::from_arr(dt, &a_w); - let b_x = TensorLayout::from_arr(dt, &b_x); - let b_dst = TensorLayout::from_arr(dt, &b_dst); - let b_src = TensorLayout::from_arr(dt, &b_src); + let c_y = TensorLayout { dt, layout: c_y }; + let a_w = TensorLayout { dt, layout: a_w }; + let b_x = TensorLayout { dt, layout: b_x }; + let b_dst = TensorLayout { dt, layout: b_dst }; + let b_src = TensorLayout { dt, layout: b_src }; // b 布局广播 - let b = Arr6::new(&[n, m, hy * wy], &[0, mbs, 0], 0); + let b = Arr4::new(&[n, m, hy * wy], &[0, mbs, 0], 0); // 广播 b self.rearrange.launch( &rearrange::Args { diff --git a/operators/src/fuesd_softmax/args.rs b/operators/src/fuesd_softmax/args.rs index 832f7f69..48eccaeb 100644 --- a/operators/src/fuesd_softmax/args.rs +++ b/operators/src/fuesd_softmax/args.rs @@ -1,4 +1,4 @@ -use crate::{rank_not_support, Hardware, MutPtr, SchemeError, TensorLayout}; +use crate::{Hardware, LaunchError, MutPtr, TensorLayout, rank_not_support}; use digit_layout::DigitLayout; use std::ptr::null_mut; @@ -28,8 +28,8 @@ impl Args { } } - pub(super) fn meta(&self) -> Result { - let dt = self.att_layout.dt(); + pub(super) fn meta(&self) -> Result { + let dt = self.att_layout.dt; if self.att_layout.ndim() != 3 { return Err(rank_not_support("")); } diff --git a/operators/src/fuesd_softmax/common_cpu/mod.rs b/operators/src/fuesd_softmax/common_cpu/mod.rs index 2e803eca..b205129e 100644 --- a/operators/src/fuesd_softmax/common_cpu/mod.rs +++ b/operators/src/fuesd_softmax/common_cpu/mod.rs @@ -1,8 +1,8 @@ -use super::{ - args::{AttnMask, Meta}, +use super::{ Args, FusedSoftmax, + args::{AttnMask, Meta}, }; -use crate::{common_cpu::Cpu, get_static, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu}; use half::f16; use rayon::iter::{IntoParallelIterator, ParallelIterator}; @@ -20,15 +20,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let _meta = args.meta()?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -44,18 +35,13 @@ impl crate::Operator for Operator { att_layout, att_base, } = args; - let &[nh, seq_len, att_len] = att_layout.shape() else { + let &[nh, seq_len, att_len] = &*att_layout.shape() else { unreachable!() }; let &[sh, ss, sa] = att_layout.strides() else { unreachable!() }; - get_static! { - nh seq_len att_len - sh ss sa - } - macro_rules! calculate { ($ty:ty) => { Scheme::<$ty> { diff --git a/operators/src/fuesd_softmax/cuda/fused_softmax.cuh b/operators/src/fuesd_softmax/cuda/fused_softmax.cuh index 745f7b1e..34495985 100644 --- a/operators/src/fuesd_softmax/cuda/fused_softmax.cuh +++ b/operators/src/fuesd_softmax/cuda/fused_softmax.cuh @@ -21,7 +21,7 @@ struct AttentionCausalMask { } }; -template +template static __device__ void block_padding( Tdata *__restrict__ att, Tmask mask, @@ -30,8 +30,8 @@ static __device__ void block_padding( auto att_idx = threadIdx.x, att_len = blockDim.x; auto thread_data = mask(tok_id, seq_len, att_idx, att_len) - ? float(att[att_idx]) - : -__FLT_MAX__; + ? float(att[att_idx]) + : -__FLT_MAX__; using BlockOp = cub::BlockReduce; __shared__ typename BlockOp::TempStorage temp_storage; @@ -40,21 +40,25 @@ static __device__ void block_padding( __shared__ float max; { auto acc = block_op.Reduce(thread_data, cub::Max(), att_len); - if (threadIdx.x == 0) { max = acc; } + if (threadIdx.x == 0) { + max = acc; + } } __syncthreads(); __shared__ float mean; { auto acc = block_op.Sum(thread_data = expf(thread_data - max), att_len); - if (threadIdx.x == 0) { mean = fdividef(1, acc); } + if (threadIdx.x == 0) { + mean = fdividef(1, acc); + } } __syncthreads(); att[att_idx] = Tdata(thread_data * mean); } -template +template static __device__ void block_folding( Tdata *__restrict__ att, Tmask mask, @@ -78,8 +82,8 @@ static __device__ void block_folding( for (unsigned int i = 0; i < local; ++i) { auto att_idx = thread_offset + i; auto val = att_idx < att_len && mask(tok_id, seq_len, att_idx, att_len) - ? float(att[i]) - : -__FLT_MAX__; + ? float(att[i]) + : -__FLT_MAX__; thread_data[i * blockDim.x] = val; thread_max = cub::Max()(thread_max, val); } @@ -91,7 +95,9 @@ static __device__ void block_folding( __shared__ float max; { auto acc = block_op.Reduce(thread_max, cub::Max()); - if (threadIdx.x == 0) { max = acc; } + if (threadIdx.x == 0) { + max = acc; + } } __syncthreads(); @@ -103,7 +109,9 @@ static __device__ void block_folding( thread_sum += (val = expf(val - max)); } auto acc = block_op.Sum(thread_sum); - if (threadIdx.x == 0) { mean = fdividef(1, acc); } + if (threadIdx.x == 0) { + mean = fdividef(1, acc); + } } __syncthreads(); @@ -115,7 +123,7 @@ static __device__ void block_folding( } // assert BLOCK_SIZE >= blockDim.x -template +template static __forceinline__ __device__ void padding( Tdata *__restrict__ att, Tmask mask, @@ -128,7 +136,7 @@ static __forceinline__ __device__ void padding( block_padding(att + offset, mask, tok_id, seq_len); } -template +template static __forceinline__ __device__ void folding( Tdata *__restrict__ att, Tmask mask, diff --git a/operators/src/fuesd_softmax/cuda/mod.rs b/operators/src/fuesd_softmax/cuda/mod.rs index ce3cc6e4..7e3e5993 100644 --- a/operators/src/fuesd_softmax/cuda/mod.rs +++ b/operators/src/fuesd_softmax/cuda/mod.rs @@ -1,16 +1,17 @@ -use super::{ - args::{AttnMask, Meta}, +use super::{ Args, FusedSoftmax, + args::{AttnMask, Meta}, }; use crate::{ + ByteOf, LaunchError, QueueAlloc, cuda::{Gpu, Handle, ModuleBox}, - get_static, strides_not_support, type_not_support, ByteOf, LaunchError, QueueAlloc, - SchemeError, + strides_not_support, type_not_support, }; +use cuda::params; use digit_layout::types::F16; use std::{ collections::HashMap, - ffi::{c_float, CString}, + ffi::{CString, c_float}, mem::size_of, sync::Arc, }; @@ -37,19 +38,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt } = args.meta()?; - if dt == F16 { - Ok(0) - } else { - Err(type_not_support("")) - } - } - fn launch( &self, args: &Self::Args, @@ -65,7 +53,7 @@ impl crate::Operator for Operator { att_layout, att_base, } = args; - let &[nh, seq_len, att_len] = att_layout.shape() else { + let &[nh, seq_len, att_len] = &*att_layout.shape() else { unreachable!() }; let &[sh, ss, sa] = att_layout.strides() else { @@ -73,17 +61,12 @@ impl crate::Operator for Operator { }; if dt != F16 { - return Err(type_not_support("").into()); - } - - get_static! { - nh seq_len att_len - sh ss sa + return Err(type_not_support("")); } let unit = dt.nbytes() as isize; if sa != unit { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; let scheme = &self.scheme[att_mask]; @@ -92,15 +75,13 @@ impl crate::Operator for Operator { let sh = (sh / unit) as i32; let ss = (ss / unit) as i32; let att_len = att_len as u32; - let params = cuda::params![att_base, 0i32, sh, ss, att_len]; + let params = params![*att_base, 0i32, sh, ss, att_len]; if att_len <= block_size { scheme.module.launch( &scheme.padding, - grid_dims, - att_len, - params.as_ptr(), - 0, + (grid_dims, att_len, 0), + ¶ms.to_ptrs(), queue.queue(), ); } else { @@ -108,10 +89,8 @@ impl crate::Operator for Operator { let smem = (num_items_thread * block_size) as usize; scheme.module.launch( &scheme.folding, - grid_dims, - block_size, - params.as_ptr(), - smem * size_of::(), + (grid_dims, block_size, smem * size_of::()), + ¶ms.to_ptrs(), queue.queue(), ); } @@ -183,17 +162,7 @@ extern "C" __global__ void {folding}( mod test { use super::{Args, AttnMask, Gpu, Operator}; use crate::{Hardware, Operator as _, TensorLayout}; - use digit_layout::{types as ty, DigitLayout}; - - fn dyn_args(dt: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::null_mut; - Args { - att_mask: AttnMask::Causal, - att_layout: TensorLayout::new_dyn(dt, &[dyn_(); 3], &[dyn_(); 3]), - att_base: null_mut(), - } - } + use digit_layout::{DigitLayout, types as ty}; fn args( dt: DigitLayout, @@ -209,27 +178,6 @@ mod test { } } - #[test] - fn test_compile() { - let Some(gpu) = Gpu::init() else { - return; - }; - println!("{}", gpu.0.device().info()); - - let mut op = Operator::new(&gpu); - op.scheme(&dyn_args(ty::F16), 0).unwrap(); - - gpu.apply(|ctx| { - for (mask, scheme) in op.scheme { - println!("{mask:?}============================"); - println!("{}", scheme.padding.to_str().unwrap()); - println!("{}", scheme.module.load(&scheme.padding, ctx).info()); - println!("{}", scheme.folding.to_str().unwrap()); - println!("{}", scheme.module.load(&scheme.folding, ctx).info()); - } - }) - } - #[test] fn test_compute() { use super::super::common_cpu::Operator as RefOp; @@ -246,10 +194,8 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - cpu_op.scheme(&dyn_args(ty::F64), 0).unwrap(); - gpu_op.scheme(&dyn_args(ty::F16), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); let nh = 32; for (seq_len, att_len) in [(1, 511), (1, 2048), (7, 511), (7, 2048)] { diff --git a/operators/src/fuesd_softmax/infini/mod.rs b/operators/src/fuesd_softmax/infini/mod.rs index 590b329b..fd2f4b22 100644 --- a/operators/src/fuesd_softmax/infini/mod.rs +++ b/operators/src/fuesd_softmax/infini/mod.rs @@ -1,10 +1,8 @@ -use infini_op::{infiniop, AsRaw, Descriptor}; - -use super::{args::Meta, Args, FusedSoftmax}; +use super::{Args, FusedSoftmax, args::Meta}; use crate::{ - fuesd_softmax::args::AttnMask, get_static, infini::Device, ByteOf, LaunchError, QueueAlloc, - SchemeError, Workspace, + ByteOf, LaunchError, QueueAlloc, Workspace, fuesd_softmax::args::AttnMask, infini::Device, }; +use infini_op::{AsRaw, Descriptor, infiniop}; pub struct Operator(Device); @@ -20,15 +18,6 @@ impl crate::Operator for Operator { Self(node.clone()) } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -47,18 +36,13 @@ impl crate::Operator for Operator { if !matches!(att_mask, AttnMask::Causal) { todo!() } - let &[nh, seq_len, att_len] = att_layout.shape() else { + let &[nh, seq_len, att_len] = &*att_layout.shape() else { unreachable!() }; let &[sh, ss, sa] = att_layout.strides() else { unreachable!() }; - get_static! { - nh seq_len att_len - sh ss sa - } - let att = infini_op::Tensor::new(dt, [nh, seq_len, att_len], [sh, ss, sa]); let descriptor = Descriptor::new( |ptr| { @@ -91,17 +75,7 @@ impl crate::Operator for Operator { mod test { use super::{Args, AttnMask, Device, Operator}; use crate::{Hardware, Operator as _, TensorLayout}; - use digit_layout::{types as ty, DigitLayout}; - - fn dyn_args(dt: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::null_mut; - Args { - att_mask: AttnMask::Causal, - att_layout: TensorLayout::new_dyn(dt, &[dyn_(); 3], &[dyn_(); 3]), - att_base: null_mut(), - } - } + use digit_layout::{DigitLayout, types as ty}; fn args( dt: DigitLayout, @@ -131,10 +105,8 @@ mod test { infini_rt::init(infini_rt::DEVICE_CPU); let dev = Device::cpu(); - let mut cpu_op = RefOp::new(&Cpu); - let mut dev_op = Operator::new(&dev); - cpu_op.scheme(&dyn_args(ty::F64), 0).unwrap(); - dev_op.scheme(&dyn_args(ty::F16), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let dev_op = Operator::new(&dev); let nh = 32; for (seq_len, att_len) in [(1, 511), (1, 2048), (7, 511), (7, 2048)] { diff --git a/operators/src/fuesd_softmax/opencl/mod.rs b/operators/src/fuesd_softmax/opencl/mod.rs index 9965444e..413c39c9 100644 --- a/operators/src/fuesd_softmax/opencl/mod.rs +++ b/operators/src/fuesd_softmax/opencl/mod.rs @@ -1,17 +1,16 @@ -use super::{args::Meta, Args, FusedSoftmax}; +use super::{Args, FusedSoftmax, args::Meta}; use crate::{ - fuesd_softmax::args::AttnMask, - get_static, - opencl::{ClDevice, CodeGen, KernelCache, CL2_0}, - strides_not_support, ByteOf, LaunchError, QueueAlloc, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity::Low as LowDiversity, - SchemeError, + fuesd_softmax::args::AttnMask, + opencl::{CL2_0, ClDevice, CodeGen, KernelCache}, + strides_not_support, }; use clrt::{ - bindings::{cl_int, cl_uint}, Context, + bindings::{cl_int, cl_uint}, }; -use digit_layout::{types as Ty, DigitLayout}; +use digit_layout::{DigitLayout, types as Ty}; use lru::LruCache; use std::sync::Mutex; @@ -47,16 +46,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt } = args.meta()?; - self.cache_kernel(dt); - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -67,7 +56,7 @@ impl crate::Operator for Operator { QA: QueueAlloc, { let Meta { dt } = args.meta()?; - self.cache_kernel(args.att_layout.dt()); + self.cache_kernel(args.att_layout.dt); let Args { att_mask, @@ -77,21 +66,16 @@ impl crate::Operator for Operator { if !matches!(*att_mask, AttnMask::Causal) { todo!() } - let &[nh, seq_len, att_len] = att_layout.shape() else { + let &[nh, seq_len, att_len] = &*att_layout.shape() else { unreachable!() }; let &[sh, ss, sa] = att_layout.strides() else { unreachable!() }; - get_static! { - nh seq_len att_len - sh ss sa - } - let unit = dt.nbytes() as isize; if sa != unit { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; let group_size = last_power_of_two(att_len.min(self.max_group_size)); @@ -163,16 +147,6 @@ mod test { use crate::{Hardware, TensorLayout}; use digit_layout::DigitLayout; - fn dyn_args(dt: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::null_mut; - Args { - att_mask: AttnMask::Causal, - att_layout: TensorLayout::new_dyn(dt, &[dyn_(); 3], &[dyn_(); 3]), - att_base: null_mut(), - } - } - fn args( dt: DigitLayout, nh: usize, @@ -191,10 +165,10 @@ mod test { fn test_compute() { use super::{super::common_cpu::Operator as RefOp, Operator}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, opencl::ClDevice, test_utils::{Diff, ErrorCollector}, - Operator as _, }; use clrt::Platform; use digit_layout::types as ty; @@ -202,16 +176,14 @@ mod test { use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use std::{iter::zip, time::Instant}; - let mut cpu_op = RefOp::new(&Cpu); + let cpu_op = RefOp::new(&Cpu); for platform in Platform::all() { for device in platform.devices() { println!("device: {}", device.name()); let context = device.context(); let queue = context.queue(); - let mut cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); - cpu_op.scheme(&dyn_args(ty::F64), 0).unwrap(); - cl_op.scheme(&dyn_args(ty::F32), 0).unwrap(); + let cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); let nh = 32; for (seq_len, att_len) in [ @@ -260,7 +232,7 @@ mod test { let cpu_time = time.elapsed(); println!("cl: {cl_time:?} / cpu: {cpu_time:?}"); - let map = queue.map(&mut att_svm); + let map = queue.map(&att_svm); let ([], mem, []) = (unsafe { map.align_to::() }) else { panic!() }; diff --git a/operators/src/gelu/args.rs b/operators/src/gelu/args.rs index 9cf27d33..ad2b0c32 100644 --- a/operators/src/gelu/args.rs +++ b/operators/src/gelu/args.rs @@ -1,4 +1,4 @@ -use crate::{utils::rank_error, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout}; +use crate::{Hardware, LaunchError, MutPtr, TensorLayout, utils::rank_error}; use digit_layout::DigitLayout; pub struct Args { @@ -8,8 +8,8 @@ pub struct Args { pub(super) struct Meta { pub dt: DigitLayout, - pub n: MaybeDyn, - pub d: MaybeDyn, + pub n: usize, + pub d: usize, } impl Args { @@ -21,15 +21,15 @@ impl Args { } } - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { layout, .. } = self; - let &[n, d] = layout.shape() else { + let &[n, d] = &*layout.shape() else { return Err(rank_error("layout", 2, layout.ndim())); }; Ok(Meta { - dt: layout.dt(), + dt: layout.dt, n, d, }) diff --git a/operators/src/gelu/common_cpu/mod.rs b/operators/src/gelu/common_cpu/mod.rs index d72041b8..7fac626b 100644 --- a/operators/src/gelu/common_cpu/mod.rs +++ b/operators/src/gelu/common_cpu/mod.rs @@ -1,5 +1,5 @@ -use super::{args::Meta, Args, Gelu}; -use crate::{common_cpu::Cpu, get_static, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Args, Gelu, args::Meta}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu}; use half::f16; pub struct Operator; @@ -15,15 +15,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let _meta = args.meta()?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -39,11 +30,6 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - sn sd - } - macro_rules! calculate { ($ty:ty) => { Scheme::<$ty> { diff --git a/operators/src/gelu/cuda/gelu.cuh b/operators/src/gelu/cuda/gelu.cuh index e0a3a33c..1575c32a 100644 --- a/operators/src/gelu/cuda/gelu.cuh +++ b/operators/src/gelu/cuda/gelu.cuh @@ -1,10 +1,11 @@ #ifndef M_SQRT1_2 #define M_SQRT1_2 .707106781186547524401f #endif -template + +template static __device__ void gelu( Tdata *__restrict__ data) { auto i = blockIdx.x * blockDim.x + threadIdx.x; auto x = float(data[i]); data[i] = Tdata(0.5f * x * (1.0f + erf(x * M_SQRT1_2))); -} \ No newline at end of file +} diff --git a/operators/src/gelu/cuda/mod.rs b/operators/src/gelu/cuda/mod.rs index 9e2f5179..2be4925c 100644 --- a/operators/src/gelu/cuda/mod.rs +++ b/operators/src/gelu/cuda/mod.rs @@ -1,13 +1,14 @@ -use super::{args::Meta, Args, Gelu}; +use super::{Args, Gelu, args::Meta}; use crate::{ + ByteOf, LaunchError, QueueAlloc, cuda::{Gpu, Handle, ModuleBox}, - get_static, strides_not_support, type_not_support, + strides_not_support, type_not_support, utils::gcd, - ByteOf, LaunchError, QueueAlloc, SchemeError, }; +use cuda::params; use digit_layout::types::F16; use std::{ - ffi::{c_uint, CString}, + ffi::{CString, c_uint}, sync::Arc, }; @@ -37,15 +38,6 @@ impl crate::Operator for Operator { } } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -58,30 +50,23 @@ impl crate::Operator for Operator { let Meta { dt, n, d } = args.meta()?; let Args { layout, base } = args; if dt != F16 { - return Err(type_not_support("").into()); + return Err(type_not_support("")); } let &[_, ds] = layout.strides() else { unreachable!() }; - get_static! { - n d ds - } - let unit = dt.nbytes() as isize; if ds != unit { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; - let params = cuda::params![base]; let block = gcd(self.max_threads_block, d); self.module.launch( CString::new(NAME).unwrap(), - (n * d).div_ceil(block) as c_uint, - block as u32, - params.as_ptr(), - 0, + ((n * d).div_ceil(block) as c_uint, block as c_uint, 0), + ¶ms![*base].to_ptrs(), queue_alloc.queue(), ); Ok(()) @@ -103,20 +88,12 @@ extern "C" __global__ void {NAME}( #[cfg(test)] mod test { use super::{Args, Gpu, Operator}; - use crate::{dyn_, Hardware, Operator as _, TensorLayout}; + use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F16, F64}, DigitLayout, + types::{F16, F64}, }; - fn dyn_args(dt: DigitLayout) -> Args { - use std::ptr::null_mut; - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]); - Args { - layout: layout.clone(), - base: null_mut(), - } - } fn args(dt: DigitLayout, n: usize, d: usize, base: *mut H::Byte) -> Args { let layout = TensorLayout::new_contiguous(dt, &[n, d]); Args { @@ -125,27 +102,6 @@ mod test { } } - #[test] - fn test_compile() { - use super::NAME; - use std::ffi::CString; - - let Some(gpu) = Gpu::init() else { - return; - }; - println!("{}", gpu.0.device().info()); - - let mut op = Operator::new(&gpu); - op.scheme(&dyn_args(F16), 0).unwrap(); - - gpu.apply(|ctx| { - println!( - "{NAME}\n{}", - op.module.load(CString::new(NAME).unwrap(), ctx).info() - ); - }) - } - #[test] fn test_compute() { use super::super::common_cpu::Operator as RefOp; @@ -162,10 +118,8 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - cpu_op.scheme(&dyn_args(F64), 0).unwrap(); - gpu_op.scheme(&dyn_args(F16), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); let n = 1024; let d = 2048; diff --git a/operators/src/gelu/infini/mod.rs b/operators/src/gelu/infini/mod.rs deleted file mode 100644 index 07510f3a..00000000 --- a/operators/src/gelu/infini/mod.rs +++ /dev/null @@ -1,39 +0,0 @@ -use super::{Args, Gelu}; -use crate::{infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError}; -use infini_op::Handle; -use std::sync::Arc; - -#[repr(transparent)] -pub struct Operator(Arc); - -impl Gelu for Operator {} - -impl crate::Operator for Operator { - type Hardware = Device; - type TopoNode = Device; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/gelu/mod.rs b/operators/src/gelu/mod.rs index f7053f30..3ca53177 100644 --- a/operators/src/gelu/mod.rs +++ b/operators/src/gelu/mod.rs @@ -2,10 +2,6 @@ pub mod common_cpu; #[cfg(use_cuda)] pub mod cuda; -#[cfg(use_infini)] -pub mod infini; -#[cfg(use_cl)] -pub mod opencl; mod args; pub use args::Args; diff --git a/operators/src/gelu/opencl/mod.rs b/operators/src/gelu/opencl/mod.rs deleted file mode 100644 index 3ac8e99a..00000000 --- a/operators/src/gelu/opencl/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use super::{Args, Gelu}; -use crate::{opencl::ClDevice, ByteOf, LaunchError, QueueAlloc, SchemeError}; - -pub struct Operator; - -impl Gelu for Operator {} - -impl crate::Operator for Operator { - type Hardware = ClDevice; - type TopoNode = ClDevice; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/handle/common_cpu/inproc_node.rs b/operators/src/handle/common_cpu/inproc_node.rs index 54cd9f42..a5a4fa92 100644 --- a/operators/src/handle/common_cpu/inproc_node.rs +++ b/operators/src/handle/common_cpu/inproc_node.rs @@ -1,9 +1,9 @@ -use super::Cpu; +use super::Cpu; use crate::TopoNode; use std::sync::{ - atomic::{AtomicUsize, Ordering::Relaxed}, - mpsc::{channel, Receiver, Sender}, Arc, Condvar, Mutex, + atomic::{AtomicUsize, Ordering::Relaxed}, + mpsc::{Receiver, Sender, channel}, }; pub struct InprocNode { diff --git a/operators/src/handle/cuda/alloc.rs b/operators/src/handle/cuda/alloc.rs index 5b4ab2fe..4610da55 100644 --- a/operators/src/handle/cuda/alloc.rs +++ b/operators/src/handle/cuda/alloc.rs @@ -99,7 +99,6 @@ impl<'ctx> Alloc> for &'ctx CurrentCtx { fn free(&self, _mem: DevMem<'ctx>) {} } -#[cfg(use_nvidia)] impl<'ctx> Alloc> for Stream<'ctx> { #[inline] fn alloc(&self, size: usize) -> DevMem<'ctx> { @@ -108,11 +107,10 @@ impl<'ctx> Alloc> for Stream<'ctx> { #[inline] fn free(&self, mem: DevMem<'ctx>) { - mem.drop_on(self) + Stream::free(self, mem); } } -#[cfg(use_nvidia)] impl<'ctx> QueueAlloc for Stream<'ctx> { type Hardware = Gpu; type DevMem = DevMem<'ctx>; diff --git a/operators/src/handle/cuda/library.rs b/operators/src/handle/cuda/library.rs index 7636a287..39fb565e 100644 --- a/operators/src/handle/cuda/library.rs +++ b/operators/src/handle/cuda/library.rs @@ -210,7 +210,7 @@ fn read_output(output: &Output) -> String { fn test_compile() { use cuda::Version; use libloading::Symbol; - use std::ffi::{c_char, CStr}; + use std::ffi::{CStr, c_char}; let lib = cache_lib( &("test_compile".into(), Version { major: 8, minor: 0 }), diff --git a/operators/src/handle/cuda/mod.rs b/operators/src/handle/cuda/mod.rs index 2e735e1b..2fdd9faa 100644 --- a/operators/src/handle/cuda/mod.rs +++ b/operators/src/handle/cuda/mod.rs @@ -219,7 +219,7 @@ where { let mut host = stream.ctx().malloc_host::(val.len()); let host = unsafe { std::slice::from_raw_parts_mut(host.as_mut_ptr().cast(), val.len()) }; - host.into_iter().zip(val).for_each(|(y, x)| *y = f(*x)); + host.iter_mut().zip(val).for_each(|(y, x)| *y = f(*x)); #[cfg(use_nvidia)] let mem = stream.from_host(host); diff --git a/operators/src/handle/cuda/module.rs b/operators/src/handle/cuda/module.rs index 30c7f7f1..58895415 100644 --- a/operators/src/handle/cuda/module.rs +++ b/operators/src/handle/cuda/module.rs @@ -1,12 +1,12 @@ use super::{Handle, Key}; use cuda::{ - bindings::nvrtcResult, ContextResource, ContextSpore, CurrentCtx, Dim3, KernelFn, ModuleSpore, - Ptx, Stream, + ContextResource, ContextSpore, CurrentCtx, Dim3, KernelFn, ModuleSpore, Ptx, Stream, + bindings::nvrtcResult, }; use log::warn; use std::{ - collections::{hash_map::Entry::Occupied, HashMap}, - ffi::{c_void, CStr}, + collections::{HashMap, hash_map::Entry::Occupied}, + ffi::{CStr, c_void}, ptr::addr_eq, sync::{Arc, OnceLock, RwLock}, }; @@ -39,19 +39,11 @@ impl ModuleBox { pub fn launch( &self, name: impl AsRef, - grid_dims: impl Into, - block_dims: impl Into, - params: *const *const c_void, - shared_mem: usize, + attrs: (impl Into, impl Into, usize), + params: &[*const c_void], stream: &Stream, ) { - self.load(name, stream.ctx()).launch( - grid_dims, - block_dims, - params, - shared_mem, - Some(stream), - ) + stream.launch(&self.load(name, stream.ctx()), attrs, params); } } diff --git a/operators/src/handle/infini/ccl.rs b/operators/src/handle/infini/ccl.rs index bdad0285..77eea46c 100644 --- a/operators/src/handle/infini/ccl.rs +++ b/operators/src/handle/infini/ccl.rs @@ -1,6 +1,6 @@ -use super::Device; +use super::Device; use crate::TopoNode; -use infini_ccl::{bindings::DeviceType, Comm}; +use infini_ccl::{Comm, bindings::DeviceType}; use std::{os::raw::c_uint, sync::Arc}; pub struct InfiniNode { diff --git a/operators/src/handle/infini/mod.rs b/operators/src/handle/infini/mod.rs index 622f0312..ea4db855 100644 --- a/operators/src/handle/infini/mod.rs +++ b/operators/src/handle/infini/mod.rs @@ -106,7 +106,7 @@ impl QueueAlloc for Stream { /// 并行转换类型并异步拷贝到显存。 #[cfg(test)] -pub(crate) fn cast_load<'ctx, T, U, F>(val: &[T], f: F, stream: &Stream) -> DevBlob +pub(crate) fn cast_load(val: &[T], f: F, stream: &Stream) -> DevBlob where T: Sync + Copy, U: Send + Copy, @@ -114,7 +114,7 @@ where { let mut host = stream.get_device().malloc_host::(val.len()); let host = unsafe { std::slice::from_raw_parts_mut(host.as_mut_ptr().cast(), val.len()) }; - host.into_iter().zip(val).for_each(|(y, x)| *y = f(*x)); + host.iter_mut().zip(val).for_each(|(y, x)| *y = f(*x)); let ans = stream.from_host(host); stream.synchronize(); ans diff --git a/operators/src/layer_norm/args.rs b/operators/src/layer_norm/args.rs index 6ee739aa..70ea9286 100644 --- a/operators/src/layer_norm/args.rs +++ b/operators/src/layer_norm/args.rs @@ -1,6 +1,6 @@ -use crate::{ +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, utils::{dim_distinct, rank_error, type_distinct}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::DigitLayout; @@ -19,12 +19,12 @@ pub struct Args { pub(super) struct Meta { pub dt_a: DigitLayout, pub dt_w: DigitLayout, - pub n: MaybeDyn, - pub d: MaybeDyn, + pub n: usize, + pub d: usize, } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { y_layout: y, x_layout: x, @@ -33,24 +33,24 @@ impl Args { .. } = self; - let &[ny, dy] = y.shape() else { + let &[ny, dy] = &*y.shape() else { return Err(rank_error("y", 2, y.ndim())); }; - let &[nx, dx] = x.shape() else { + let &[nx, dx] = &*x.shape() else { return Err(rank_error("x", 2, x.ndim())); }; - let &[ds] = scale.shape() else { + let &[ds] = &*scale.shape() else { return Err(rank_error("scale", 1, scale.ndim())); }; - let &[db] = bias.shape() else { + let &[db] = &*bias.shape() else { return Err(rank_error("bias", 1, bias.ndim())); }; Ok(Meta { - dt_a: type_distinct(&[y.dt(), x.dt()])?, - dt_w: type_distinct(&[scale.dt(), bias.dt()])?, - n: dim_distinct(&[ny, nx])?, - d: dim_distinct(&[dy, dx, ds, db])?, + dt_a: type_distinct(&[y.dt, x.dt])?, + dt_w: type_distinct(&[scale.dt, bias.dt])?, + n: dim_distinct(&[ny, nx]).expect("m mismatch"), + d: dim_distinct(&[dy, dx, ds, db]).expect("d mismatch"), }) } } diff --git a/operators/src/layer_norm/common_cpu/mod.rs b/operators/src/layer_norm/common_cpu/mod.rs index d5b768d4..aff62ac9 100644 --- a/operators/src/layer_norm/common_cpu/mod.rs +++ b/operators/src/layer_norm/common_cpu/mod.rs @@ -1,7 +1,7 @@ -use super::{args::Meta, Args, LayerNorm}; -use crate::{common_cpu::Cpu, get_static, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Args, LayerNorm, args::Meta}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu}; use half::f16; -use num_traits::{real::Real, NumCast, ToPrimitive}; +use num_traits::{NumCast, ToPrimitive, real::Real}; use std::ops::AddAssign; pub struct Operator; @@ -17,15 +17,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let _meta = args.meta()?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -60,14 +51,6 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - nsy dsy - nsx dsx - dss - dsb - } - macro_rules! calculate { ($eps:expr; $w:ty, $a:ty) => { Scheme { diff --git a/operators/src/layer_norm/cuda/layer_norm.cuh b/operators/src/layer_norm/cuda/layer_norm.cuh index ed076593..988dc3af 100644 --- a/operators/src/layer_norm/cuda/layer_norm.cuh +++ b/operators/src/layer_norm/cuda/layer_norm.cuh @@ -10,7 +10,8 @@ struct SumPair { return SumPair{this->average + other.average, this->variance + other.variance}; } }; -template + +template static __device__ void padding( Ta *__restrict__ y_, int const stride_y, @@ -39,7 +40,7 @@ static __device__ void padding( *y = Ta((x - average) * variance * s + b); } -template +template static __device__ void folding( Ta *__restrict__ y_, int const stride_y, diff --git a/operators/src/layer_norm/cuda/mod.rs b/operators/src/layer_norm/cuda/mod.rs index 131a073b..1599750c 100644 --- a/operators/src/layer_norm/cuda/mod.rs +++ b/operators/src/layer_norm/cuda/mod.rs @@ -1,15 +1,15 @@ use super::{Args, LayerNorm}; use crate::{ - cuda::{dt_name, Gpu, Handle, ModuleBox}, - get_static, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity, + cuda::{Gpu, Handle, ModuleBox, dt_name}, layer_norm::args::Meta, - shape_not_support, strides_not_support, ByteOf, LaunchError, QueueAlloc, SchemeDiversity, - SchemeError, + shape_not_support, strides_not_support, }; +use cuda::params; use digit_layout::DigitLayout; use lru::LruCache; use std::{ - ffi::CString, + ffi::{CString, c_uint}, sync::{Arc, Mutex}, }; @@ -32,22 +32,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt_a, dt_w, d, .. } = args.meta()?; - get_static!(d); - - let key = SchemeKey { dt_a, dt_w, d }; - self.schemes - .lock() - .unwrap() - .try_get_or_insert(key, || Scheme::new(&self.handle, key))?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -82,21 +66,13 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - nsy dsy - nsx dsx - dss - dsb - } - let unit = dt_a.nbytes() as isize; if dsy != unit || dsx != unit || dss != dt_w.nbytes() as isize || dsb != dt_w.nbytes() as isize { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; let key = SchemeKey { dt_a, dt_w, d }; let scheme = self @@ -108,17 +84,27 @@ impl crate::Operator for Operator { let nsy = (nsy / unit) as i32; let nsx = (nsx / unit) as i32; - let params = cuda::params![y_base, nsy, x_base, nsx, scale_base, bias_base, epsilon]; + let params = params![ + *y_base, + nsy, + *x_base, + nsx, + *scale_base, + *bias_base, + *epsilon + ]; scheme.module.launch( &scheme.name, - n as u32, - match scheme.ty { - SchemeType::Padding => d, - SchemeType::Folding { block_size } => block_size, - } as u32, - params.as_ptr(), - 0, + ( + n as c_uint, + match scheme.ty { + SchemeType::Padding => d, + SchemeType::Folding { block_size } => block_size, + } as c_uint, + 0, + ), + ¶ms.to_ptrs(), queue_alloc.queue(), ); @@ -150,7 +136,7 @@ impl Scheme { pub fn new( handle: &Arc, SchemeKey { dt_a, dt_w, d }: SchemeKey, - ) -> Result { + ) -> Result { let device = handle.device(); let cc = device.compute_capability(); let block_size = device.block_limit().max_threads; @@ -231,30 +217,14 @@ extern "C" __global__ void {name}( #[cfg(test)] mod test { use super::{Args, Gpu, Operator}; - use crate::{dyn_, Hardware, Operator as _, TensorLayout}; + use crate::{Hardware, Operator as _, TensorLayout}; use core::f32; use digit_layout::{ - types::{F16, F32, F64}, DigitLayout, + types::{F16, F32, F64}, }; - use std::ptr::null; - fn dyn_args(dt_a: DigitLayout, dt_w: DigitLayout, d: usize) -> Args { - use std::ptr::null_mut; - let yx_layout = TensorLayout::new_dyn(dt_a, &[dyn_(), d.into()], &[dyn_(); 2]); - let sb_layout = TensorLayout::new_dyn(dt_w, &[d.into()], &[dyn_()]); - Args { - y_layout: yx_layout.clone(), - y_base: null_mut(), - x_layout: yx_layout.clone(), - x_base: null(), - scale_layout: sb_layout.clone(), - scale_base: null(), - bias_layout: sb_layout.clone(), - bias_base: null(), - epsilon: 0.1f32, - } - } + #[allow(clippy::too_many_arguments)] fn args( dt_a: DigitLayout, dt_w: DigitLayout, @@ -297,14 +267,12 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); for k in 8..=13 { let n = 4; let d = 1 << k; let epsilon = 1.0f32; - cpu_op.scheme(&dyn_args(F64, F64, d), 0).unwrap(); - gpu_op.scheme(&dyn_args(F16, F32, d), 0).unwrap(); let y = vec![0.0f64; n * d]; let mut x = vec![1.0f64; n * d]; let mut scale = vec![1.0f64; d]; diff --git a/operators/src/layer_norm/infini/mod.rs b/operators/src/layer_norm/infini/mod.rs deleted file mode 100644 index 9caf6956..00000000 --- a/operators/src/layer_norm/infini/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use super::{Args, LayerNorm}; -use crate::{infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError}; - -pub struct Operator; - -impl LayerNorm for Operator {} - -impl crate::Operator for Operator { - type Hardware = Device; - type TopoNode = Device; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/layer_norm/mod.rs b/operators/src/layer_norm/mod.rs index fbe4550d..1c4cd473 100644 --- a/operators/src/layer_norm/mod.rs +++ b/operators/src/layer_norm/mod.rs @@ -2,10 +2,6 @@ pub mod common_cpu; #[cfg(use_cuda)] pub mod cuda; -#[cfg(use_infini)] -pub mod infini; -#[cfg(use_cl)] -pub mod opencl; mod args; pub use args::Args; diff --git a/operators/src/layer_norm/opencl/mod.rs b/operators/src/layer_norm/opencl/mod.rs deleted file mode 100644 index 4975844b..00000000 --- a/operators/src/layer_norm/opencl/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use super::{Args, LayerNorm}; -use crate::{opencl::ClDevice, ByteOf, LaunchError, QueueAlloc, SchemeError}; - -pub struct Operator; - -impl LayerNorm for Operator {} - -impl crate::Operator for Operator { - type Hardware = ClDevice; - type TopoNode = ClDevice; - type Args = Args; - - fn new(_node: &Self::TopoNode) -> Self { - todo!() - } - - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - todo!() - } - - fn launch( - &self, - _args: &Self::Args, - _workspace: &mut [ByteOf], - _queue_alloc: &QA, - ) -> Result<(), LaunchError> - where - QA: QueueAlloc, - { - todo!() - } -} diff --git a/operators/src/lib.rs b/operators/src/lib.rs index 02088acc..84f28ab2 100644 --- a/operators/src/lib.rs +++ b/operators/src/lib.rs @@ -113,23 +113,6 @@ pub trait Operator { /// 在指定拓扑节点上创建算子实例。 fn new(node: &Self::TopoNode) -> Self; - /// 规划执行方案。 - /// - /// 通过向算子实例提供尽可能详细的参数来尽量确定算子执行方案。 - /// 通过允许参数中标量值、张量形状、张量步长和张量基址的动态性([ArgVal] 或 [null](std::ptr::null))来尽可能复用算子实例。 - /// - /// 另外,需要传入一个最大工作空间容量。工作空间是与硬件存储单元相同类型的存储区域,供算子执行过程中使用。 - /// 规划执行方案时,将尽可能尝试计算一个满足最大工作空间容量的工作空间需求,作为返回值。 - /// - /// 算子的返回值将保证不大于最大工作空间容量。如果算子还需要更多空间,可能产生运行时分配。 - /// - /// 由于参数提供可能不全,有时无法计算出具体的工作空间需求,算子将返回 0 作为工作空间需求,并在执行时再计算实际的需求。 - fn scheme( - &mut self, - args: &Self::Args, - max_workspace_size: usize, - ) -> Result; - /// 发射算子到任务队列。 /// /// 如果算子实际需要的工作空间大于通过参数提供的工作空间,将通过流分配器分配和释放工作空间。 @@ -197,15 +180,6 @@ where Self(R::new(node), PhantomData) } - #[inline] - fn scheme( - &mut self, - args: &Self::Args, - max_workspace_size: usize, - ) -> Result { - self.0.scheme(args.as_ref(), max_workspace_size) - } - #[inline] fn launch( &self, diff --git a/operators/src/mat_mul/args.rs b/operators/src/mat_mul/args.rs index e4361616..91849d33 100644 --- a/operators/src/mat_mul/args.rs +++ b/operators/src/mat_mul/args.rs @@ -1,6 +1,6 @@ -use crate::{ - dyn_not_support, rank_not_support, shape_mismatch, shape_not_support, strides_not_support, - utils::type_distinct, ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, rank_not_support, shape_mismatch, + shape_not_support, strides_not_support, utils::type_distinct, }; use digit_layout::DigitLayout; use std::{ @@ -61,7 +61,7 @@ impl Args { } } - pub(super) fn layout(&self) -> Result { + pub(super) fn layout(&self) -> Result { let Self { c_layout, a_layout, @@ -99,7 +99,7 @@ impl Args { let (a_ld, a_trans) = a.ld_trans()?; let (b_ld, b_trans) = b.ld_trans()?; Ok(SchemeLayout { - dt: type_distinct(&[c_layout.dt(), a_layout.dt(), b_layout.dt()])?, + dt: type_distinct(&[c_layout.dt, a_layout.dt, b_layout.dt])?, ab_swap, a_trans, b_trans, @@ -132,23 +132,16 @@ struct Matrix { } impl TryFrom<&TensorLayout> for Matrix { - type Error = SchemeError; + type Error = LaunchError; fn try_from(tensor: &TensorLayout) -> Result { - let Some(shape) = MaybeDyn::get_all(tensor.shape()) else { - return Err(dyn_not_support("")); - }; - let Some(strides) = MaybeDyn::get_all(tensor.strides()) else { - return Err(dyn_not_support("")); - }; - - let [batch @ .., r, c] = shape else { + let [batch @ .., r, c] = &*tensor.shape() else { return Err(rank_not_support("Matrix must have rank 2 or more")); }; - let [stride @ .., rs, cs] = strides else { + let [stride @ .., rs, cs] = tensor.strides() else { unreachable!(); }; - let unit = tensor.dt().nbytes() as isize; + let unit = tensor.dt.nbytes() as isize; let (batch, stride) = match batch { [] | [1] => { assert!(matches!(stride, [] | [_])); @@ -177,7 +170,7 @@ impl Matrix { self.batch == 1 || self.batch == batch } #[inline(always)] - fn ld_trans(&mut self) -> Result<(isize, bool), SchemeError> { + fn ld_trans(&mut self) -> Result<(isize, bool), LaunchError> { match (self.rs, self.cs) { (1, cs) => Ok((cs, false)), (rs, 1) => Ok((rs, true)), diff --git a/operators/src/mat_mul/common_cpu/mod.rs b/operators/src/mat_mul/common_cpu/mod.rs index 445c497e..cb76fadd 100644 --- a/operators/src/mat_mul/common_cpu/mod.rs +++ b/operators/src/mat_mul/common_cpu/mod.rs @@ -1,5 +1,5 @@ -use super::{args::SchemeLayout, Args, MatMul}; -use crate::{common_cpu::Cpu, type_not_support, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Args, MatMul, args::SchemeLayout}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu, type_not_support}; pub struct Operator; @@ -15,14 +15,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, diff --git a/operators/src/mat_mul/cuda/mod.rs b/operators/src/mat_mul/cuda/mod.rs index 480408f8..6a3ef2e3 100644 --- a/operators/src/mat_mul/cuda/mod.rs +++ b/operators/src/mat_mul/cuda/mod.rs @@ -1,7 +1,8 @@ -use super::{args::SchemeLayout, Args, MatMul}; +use super::{Args, MatMul, args::SchemeLayout}; use crate::{ + ByteOf, LaunchError, QueueAlloc, cuda::{Gpu, Handle}, - type_not_support, ByteOf, LaunchError, QueueAlloc, SchemeError, + type_not_support, }; use cublas::cublas; use cuda::AsRaw; @@ -27,15 +28,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - // 仅支持 cublas,不需要为执行做准备 - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -71,7 +63,7 @@ impl crate::Operator for Operator { } = args; if dt != F16 { - return Err(type_not_support("").into()); + return Err(type_not_support("")); } let (a, b) = if ab_swap { @@ -137,6 +129,7 @@ mod test { const ALPHA: f32 = 0.5; const BETA: f32 = 1.; + #[allow(clippy::too_many_arguments)] fn args( dt: DigitLayout, batch: usize, @@ -163,10 +156,10 @@ mod test { fn test_compute() { use super::{super::common_cpu::Operator as RefOp, Gpu, Operator}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, cuda::cast_load, test_utils::{Diff, ErrorCollector}, - Operator as _, }; use cuda::memcpy_d2h; use digit_layout::types::{F16, F64}; diff --git a/operators/src/mat_mul/infini/mod.rs b/operators/src/mat_mul/infini/mod.rs index 609d19e5..5f762799 100644 --- a/operators/src/mat_mul/infini/mod.rs +++ b/operators/src/mat_mul/infini/mod.rs @@ -1,8 +1,6 @@ -use super::{Args, MatMul}; -use crate::{ - infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError, TensorLayout, Workspace, -}; -use infini_op::{infiniop, AsRaw, Descriptor}; +use super::{Args, MatMul}; +use crate::{ByteOf, LaunchError, QueueAlloc, TensorLayout, Workspace, infini::Device}; +use infini_op::{AsRaw, Descriptor, infiniop}; pub struct Operator(Device); @@ -18,15 +16,6 @@ impl crate::Operator for Operator { Self(node.clone()) } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -49,9 +38,9 @@ impl crate::Operator for Operator { fn tensor(layout: &TensorLayout) -> infini_op::Tensor { infini_op::Tensor::new( - layout.dt(), - layout.shape().iter().map(|&x| *x.get_static().unwrap()), - layout.strides().iter().map(|&x| *x.get_static().unwrap()), + layout.dt, + layout.shape().iter().cloned(), + layout.strides().iter().cloned(), ) } @@ -104,6 +93,7 @@ mod test { const ALPHA: f32 = 0.5; const BETA: f32 = 1.; + #[allow(clippy::too_many_arguments)] fn args( dt: DigitLayout, batch: usize, @@ -130,10 +120,10 @@ mod test { fn test_compute() { use super::{super::common_cpu::Operator as RefOp, Device, Operator}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, infini::cast_load, test_utils::{Diff, ErrorCollector}, - Operator as _, }; use digit_layout::types::{F16, F64}; use half::f16; diff --git a/operators/src/mat_mul/opencl/mod.rs b/operators/src/mat_mul/opencl/mod.rs index bc23a3df..d31fa159 100644 --- a/operators/src/mat_mul/opencl/mod.rs +++ b/operators/src/mat_mul/opencl/mod.rs @@ -1,12 +1,11 @@ -use super::{args::SchemeLayout, Args, MatMul}; +use super::{Args, MatMul, args::SchemeLayout}; use crate::{ - opencl::{ClDevice, CodeGen, KernelCache, CL2_0}, ByteOf, LaunchError, QueueAlloc, SchemeDiversity::Low as LowDiversity, - SchemeError, + opencl::{CL2_0, ClDevice, CodeGen, KernelCache}, }; -use clrt::{bindings::cl_int, Context}; -use digit_layout::{types as Ty, DigitLayout}; +use clrt::{Context, bindings::cl_int}; +use digit_layout::{DigitLayout, types as Ty}; use lru::LruCache; use std::sync::Mutex; @@ -39,14 +38,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -177,6 +168,7 @@ mod test { const ALPHA: f32 = 0.5; const BETA: f32 = 1.; + #[allow(clippy::too_many_arguments)] fn args( dt: DigitLayout, batch: usize, @@ -203,10 +195,10 @@ mod test { fn test_compute() { use super::{super::common_cpu::Operator as RefOp, Operator}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, opencl::ClDevice, test_utils::{Diff, ErrorCollector}, - Operator as _, }; use clrt::Platform; use digit_layout::types::{F32, F64}; @@ -309,7 +301,7 @@ mod test { .unwrap(); let cpu_time = time.elapsed(); - let map = queue.map(&mut c_svm); + let map = queue.map(&c_svm); let ([], y_ans, []) = (unsafe { map.align_to::() }) else { panic!() }; diff --git a/operators/src/random_sample/args.rs b/operators/src/random_sample/args.rs index 0586b80e..d1a2e1a6 100644 --- a/operators/src/random_sample/args.rs +++ b/operators/src/random_sample/args.rs @@ -1,9 +1,8 @@ -use super::KVPair; +use super::KVPair; use crate::{ - type_not_support, utils::rank_error, ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, - TensorLayout, + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, type_not_support, utils::rank_error, }; -use digit_layout::{types as ty, DigitLayout}; +use digit_layout::{DigitLayout, types as ty}; use std::ptr::{null, null_mut}; pub struct Args { @@ -86,11 +85,11 @@ impl SampleArgs { #[derive(PartialEq, Eq, Debug)] pub(super) struct Meta { pub dt: DigitLayout, - pub n: MaybeDyn, + pub n: usize, } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { kv_pair, logits, @@ -98,21 +97,21 @@ impl Args { .. } = self; - if kv_pair.dt() != KVPair::<()>::LAYOUT { + if kv_pair.dt != KVPair::<()>::LAYOUT { return Err(type_not_support("output must be KVpair")); } - let dt_p = logits.dt(); + let dt_p = logits.dt; if dt_p.nbytes() > size_of::() { return Err(type_not_support("element too large")); } - if indices.dt() != ty::U32 { + if indices.dt != ty::U32 { return Err(type_not_support("indices must be u32")); } - let &[n] = self.logits.shape() else { + let &[n] = &*self.logits.shape() else { return Err(rank_error("logits", 1, self.logits.ndim())); }; - let &[_] = self.indices.shape() else { + let &[_] = &*self.indices.shape() else { return Err(rank_error("indices", 1, self.indices.ndim())); }; diff --git a/operators/src/random_sample/common_cpu/mod.rs b/operators/src/random_sample/common_cpu/mod.rs index fee162ca..b883d095 100644 --- a/operators/src/random_sample/common_cpu/mod.rs +++ b/operators/src/random_sample/common_cpu/mod.rs @@ -1,7 +1,6 @@ -use super::{args::Meta, Args, Indices, KVPair, RandomSample, SampleArgs}; +use super::{Args, Indices, KVPair, RandomSample, SampleArgs, args::Meta}; use crate::{ - common_cpu::Cpu, get_static, strides_not_support, type_not_support, ByteOf, LaunchError, - QueueAlloc, SchemeError, + ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu, strides_not_support, type_not_support, }; use half::f16; use num_traits::Float; @@ -30,14 +29,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -51,11 +42,10 @@ impl crate::Operator for Operator { let &[s] = args.logits.strides() else { unreachable!() }; - if s.get_static().copied() != Some(dt.nbytes() as isize) { - return Err(strides_not_support("").into()); + if s != dt.nbytes() as isize { + return Err(strides_not_support("")); } - get_static!(n); let Args { kv_pair_base, logits_base, @@ -74,7 +64,7 @@ impl crate::Operator for Operator { match dt { ty::F16 => argmax!(f16), ty::F32 => argmax!(f32), - e => return Err(type_not_support(format!("{e} not support")).into()), + e => return Err(type_not_support(format!("{e} not support"))), } } else { let &SampleArgs { @@ -90,7 +80,7 @@ impl crate::Operator for Operator { match dt { ty::F16 => random!(f16), ty::F32 => random!(f32), - e => return Err(type_not_support(format!("{e} not support")).into()), + e => return Err(type_not_support(format!("{e} not support"))), } }; unsafe { kv_pair_base.cast::>().write(kv) }; diff --git a/operators/src/random_sample/cuda/ffi.rs b/operators/src/random_sample/cuda/ffi.rs index e0c2bbeb..fa913871 100644 --- a/operators/src/random_sample/cuda/ffi.rs +++ b/operators/src/random_sample/cuda/ffi.rs @@ -1,5 +1,5 @@ -use crate::{random_sample::SampleArgs, LaunchError}; -use cuda::{bindings::CUstream, AsRaw, DevByte, Stream}; +use crate::{LaunchError, random_sample::SampleArgs}; +use cuda::{AsRaw, DevByte, Stream, bindings::CUstream}; use libloading::Library; type WorkspaceFunc = unsafe extern "C" fn( diff --git a/operators/src/random_sample/cuda/mod.rs b/operators/src/random_sample/cuda/mod.rs index dfeace1b..94a65ada 100644 --- a/operators/src/random_sample/cuda/mod.rs +++ b/operators/src/random_sample/cuda/mod.rs @@ -1,13 +1,13 @@ mod ffi; use super::{ - args::{Meta, SampleArgs}, Args, Indices, RandomSample, + args::{Meta, SampleArgs}, }; use crate::{ - cuda::{dt_name, Gpu, Handle}, - get_static, strides_not_support, ByteOf, LaunchError, QueueAlloc, SchemeDiversity, SchemeError, - Workspace, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity, Workspace, + cuda::{Gpu, Handle, dt_name}, + strides_not_support, }; use cuda::{DevByte, Stream}; use digit_layout::DigitLayout; @@ -50,35 +50,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - max_workspace_size: usize, - ) -> Result { - let meta = args.meta()?; - let mut schemes = self.schemes.lock().unwrap(); - let scheme = schemes.get_or_insert(meta.dt, || Scheme::new(&self.handle, meta.dt)); - let Some(&n) = meta.n.get_static() else { - return Ok(0); - }; - let (argmax_size, sample_size) = scheme.workspace_size(n); - drop(schemes); - - let (max, min) = if argmax_size > sample_size { - (argmax_size, sample_size) - } else { - (sample_size, argmax_size) - }; - - Ok(if max <= max_workspace_size { - max - } else if min <= max_workspace_size { - min - } else { - 0 - }) - } - fn launch( &self, args: &Self::Args, @@ -110,13 +81,11 @@ impl crate::Operator for Operator { unreachable!() }; - get_static!(n sp si); - if dt.nbytes() as isize != sp { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); } if size_of::() as isize != si { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); } let scheme = self @@ -239,10 +208,10 @@ impl Scheme { #[test] fn test_compute() { - use super::{common_cpu::Operator as RefOp, KVPair}; + use super::{KVPair, common_cpu::Operator as RefOp}; use crate::{ - common_cpu::{Cpu, ThisThread}, Operator as _, + common_cpu::{Cpu, ThisThread}, }; use cuda::memcpy_d2h; use digit_layout::types as ty; @@ -256,13 +225,7 @@ fn test_compute() { let n = 32000; let cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - println!( - "workspace = {}", - gpu_op - .scheme(&Args::layout(ty::F32, n), usize::MAX) - .unwrap() - ); + let gpu_op = Operator::new(&gpu); let mut logits = vec![0.0f32; n]; rand::rng().fill(&mut logits[..]); diff --git a/operators/src/random_sample/cuda/sample.cuh b/operators/src/random_sample/cuda/sample.cuh index 27ef3349..62dc625a 100644 --- a/operators/src/random_sample/cuda/sample.cuh +++ b/operators/src/random_sample/cuda/sample.cuh @@ -3,7 +3,7 @@ #include #include -template +template cudaError arg_max_( cub::KeyValuePair *kv_pair, T const *logits, @@ -17,7 +17,7 @@ cudaError arg_max_( stream); } -template +template cudaError radix_sort( void *workspace_ptr, size_t &workspace_len, T const *key_in, T *key_out, @@ -33,7 +33,7 @@ cudaError radix_sort( stream); } -template +template cudaError inclusive_sum( void *workspace_ptr, size_t &workspace_len, T *data, int n, @@ -44,24 +44,24 @@ cudaError inclusive_sum( stream); } -template +template __global__ void partial_softmax_kernel( T *__restrict__ data, int n, float temperature) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (0 < i && i < n) { float max = __ldg(data); - data[i] = (T) expf(((float) data[i] - max) / temperature); + data[i] = (T)expf(((float)data[i] - max) / temperature); } } -template +template __global__ void set_softmax_max_kernel( T *__restrict__ data) { *data = 1; } -template +template __global__ void random_sample_kernel( cub::KeyValuePair *__restrict__ kv_pair, T const *__restrict__ sorted, @@ -69,7 +69,7 @@ __global__ void random_sample_kernel( size_t n, float random, float topp, size_t topk) { topk = cub::Min()(topk, n); - auto p = (T) (random * cub::Min()(topp * (float) sorted[n - 1], (float) sorted[topk - 1])); + auto p = (T)(random * cub::Min()(topp * (float)sorted[n - 1], (float)sorted[topk - 1])); for (size_t i = 0;; ++i) { if ((sorted[i]) >= p) { kv_pair->key = indices_out[i]; @@ -92,7 +92,7 @@ constexpr size_t align(size_t size, size_t alignment) { return (size + alignment - 1) & ~(alignment - 1); } -template +template cudaError calculate_workspace_size( size_t *argmax, size_t *random_sample, @@ -119,17 +119,17 @@ cudaError calculate_workspace_size( nullptr)) size_t size_random = 0; - size_random += sizeof(T) * n; // sorted - size_random = align(size_random, 256); // - size_random += sizeof(unsigned int) * n;// indices_out - size_random = align(size_random, 256); // + size_random += sizeof(T) * n; // sorted + size_random = align(size_random, 256); // + size_random += sizeof(unsigned int) * n; // indices_out + size_random = align(size_random, 256); // size_random += cub::Max()(size_radix_sort, size_inclusive_sum); *random_sample = size_random; return cudaGetLastError(); } -template +template cudaError arg_max( cub::KeyValuePair *kv_pair, T const *logits, @@ -147,7 +147,7 @@ cudaError arg_max( stream); } -template +template cudaError random_sample( cub::KeyValuePair *kv_pair, T const *logits, @@ -185,7 +185,7 @@ cudaError random_sample( n, stream)); // softmax - auto block = cub::Min()((size_t) 1024, n); + auto block = cub::Min()((size_t)1024, n); auto grid = (n + block - 1) / block; partial_softmax_kernel<<>>(sorted, n, temperature); set_softmax_max_kernel<<<1, 1, 0, stream>>>(sorted); diff --git a/operators/src/random_sample/infini/mod.rs b/operators/src/random_sample/infini/mod.rs index b0c07989..1a3161d1 100644 --- a/operators/src/random_sample/infini/mod.rs +++ b/operators/src/random_sample/infini/mod.rs @@ -1,9 +1,8 @@ -use super::{args::Meta, common_cpu::Operator as RefOp, Args, Indices, RandomSample}; +use super::{Args, Indices, RandomSample, args::Meta, common_cpu::Operator as RefOp}; use crate::{ + ByteOf, LaunchError, QueueAlloc, common_cpu::{Cpu, ThisThread}, - get_static, infini::Device, - ByteOf, LaunchError, QueueAlloc, SchemeError, }; use std::{ptr::null, slice::from_raw_parts}; @@ -30,14 +29,6 @@ impl crate::Operator for Operator { Self(node.clone()) } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -58,9 +49,7 @@ impl crate::Operator for Operator { seed, } = args; let Meta { dt, n } = args.meta()?; - get_static! { - n - } + let unit = dt.nbytes(); let mut host = vec![0u8; n * unit]; @@ -89,11 +78,10 @@ impl crate::Operator for Operator { #[test] fn test_compute() { use super::args::SampleArgs; - use super::{common_cpu::Operator as RefOp, KVPair}; + use super::{KVPair, common_cpu::Operator as RefOp}; use crate::{ - common_cpu::{Cpu, ThisThread}, - infini::cast_load, Operator as _, + common_cpu::{Cpu, ThisThread}, }; use digit_layout::types as ty; use rand::Rng; @@ -102,13 +90,9 @@ fn test_compute() { let dev = Device::cpu(); let cpu_op = RefOp::new(&Cpu); - let mut dev_op = Operator::new(&dev); + let dev_op = Operator::new(&dev); let n = 32000; - dev_op - .scheme(&Args::layout(ty::F32, n), usize::MAX) - .unwrap(); - let mut logits = vec![0.0f32; n]; rand::rng().fill(&mut logits[..]); @@ -116,7 +100,7 @@ fn test_compute() { { let kv_ans = { let stream = dev.stream(); - let logits = cast_load(&logits, |x| x as f32, &stream); + let logits = stream.from_host(&logits); let mut kv: KVPair = KVPair::new(u32::MAX, 0.0f32); dev_op diff --git a/operators/src/random_sample/opencl/mod.rs b/operators/src/random_sample/opencl/mod.rs index c12133dd..7de99c10 100644 --- a/operators/src/random_sample/opencl/mod.rs +++ b/operators/src/random_sample/opencl/mod.rs @@ -1,15 +1,15 @@ -//! ref: +//! ref: -use super::{args::Meta, Args, Indices, KVPair, RandomSample}; +use super::{Args, Indices, KVPair, RandomSample, args::Meta}; use crate::{ - get_static, - opencl::{ClDevice, CodeGen, KernelCache, CL2_0}, - strides_not_support, ByteOf, LaunchError, QueueAlloc, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity::Low as LowDiversity, - SchemeError, Workspace, + Workspace, + opencl::{CL2_0, ClDevice, CodeGen, KernelCache}, + strides_not_support, }; -use clrt::{bindings::cl_uint, Context}; -use digit_layout::{types as Ty, DigitLayout}; +use clrt::{Context, bindings::cl_uint}; +use digit_layout::{DigitLayout, types as Ty}; use lru::LruCache; use std::sync::Mutex; @@ -52,27 +52,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt, n } = args.meta()?; - - let Some(&n) = n.get_static() else { - return Ok(0); - }; - - let key = self.cache_kernel(dt, n); - let n_pairs = n / key.group_size / 2; - - Ok(match n_pairs { - 0 => unreachable!(), - 1 => 0, - n => n * KVPair::<()>::LAYOUT.nbytes(), - }) - } - fn launch( &self, args: &Self::Args, @@ -86,11 +65,10 @@ impl crate::Operator for Operator { let &[s] = args.logits.strides() else { unreachable!() }; - if s.get_static().copied() != Some(dt.nbytes() as isize) { - return Err(strides_not_support("").into()); + if s != dt.nbytes() as isize { + return Err(strides_not_support("")); } - get_static!(n); let Args { kv_pair_base, logits_base, @@ -204,11 +182,11 @@ struct SchemeKey { #[test] fn test_compute() { - use super::{common_cpu::Operator as RefOp, KVPair}; + use super::{KVPair, common_cpu::Operator as RefOp}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, opencl::ClDevice, - Operator as _, }; use clrt::Platform; use digit_layout::types as ty; @@ -251,7 +229,7 @@ fn test_compute() { &queue, ) .unwrap(); - let map = queue.map(&mut kv_pair_svm); + let map = queue.map(&kv_pair_svm); let kv_ans = unsafe { *map.as_ptr().cast::>() }; queue.unmap(map); queue.finish(); diff --git a/operators/src/rearrange/args.rs b/operators/src/rearrange/args.rs index 64aa3d3d..7c1b8d5e 100644 --- a/operators/src/rearrange/args.rs +++ b/operators/src/rearrange/args.rs @@ -1,6 +1,6 @@ -use crate::{ - rank_mismatch, shape_mismatch, shape_not_support, static_from, utils::type_distinct, ConstPtr, - Hardware, MutPtr, SchemeError, TensorLayout, +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, rank_mismatch, shape_mismatch, + shape_not_support, utils::type_distinct, }; use std::{ cmp::Ordering, @@ -32,14 +32,14 @@ impl Args { pub(super) struct Scheme(Vec); impl Scheme { - pub fn new(args: &Args) -> Result { + pub fn new(args: &Args) -> Result { let Args { dst_layout: dst_, src_layout: src_, .. } = args; // # 检查基本属性 - let _ = type_distinct(&[dst_.dt(), src_.dt()])?; + let _ = type_distinct(&[dst_.dt, src_.dt])?; let ndim = dst_.ndim(); if src_.ndim() != ndim { return Err(rank_mismatch(format!( @@ -62,16 +62,16 @@ impl Scheme { let sd = dst_.strides(); let ss = src_.strides(); for i in 0..ndim { - let dd = *static_from(&dd[i])?; - let ds = *static_from(&ds[i])?; + let dd = dd[i]; + let ds = ds[i]; if dd != ds { Err(shape_mismatch(format!("dst[{i}] = {dd}, src[{i}] = {ds}")))?; } // 静态化 let dim = Dim { len: dd, - dst: *static_from(&sd[i])?, - src: *static_from(&ss[i])?, + dst: sd[i], + src: ss[i], }; // 剔除初始的 1 长维度 if dim.len != 1 { @@ -105,7 +105,7 @@ impl Scheme { } dims.sort_unstable(); // # 合并连续维度 - let mut unit = dst_.dt().nbytes() as isize; + let mut unit = dst_.dt.nbytes() as isize; let mut ndim = dims.len(); // ## 合并末尾连续维度到 unit for dim in dims.iter_mut().rev() { @@ -268,13 +268,13 @@ fn test_scheme() { dst_layout: TensorLayout::new( F16, &shape, - &[33554432 * 2, 16777216 * 2, 524288 * 2, 128 * 2, 1 * 2], + &[33554432 * 2, 16777216 * 2, 524288 * 2, 128 * 2, 2], ), dst_base: null_mut(), src_layout: TensorLayout::new( F16, &shape, - &[33554432 * 2, 16777216 * 2, 524288 * 2, 128 * 2, 1 * 2], + &[33554432 * 2, 16777216 * 2, 524288 * 2, 128 * 2, 2], ), src_base: null(), }; diff --git a/operators/src/rearrange/common_cpu/mod.rs b/operators/src/rearrange/common_cpu/mod.rs index e7219e20..58bcb663 100644 --- a/operators/src/rearrange/common_cpu/mod.rs +++ b/operators/src/rearrange/common_cpu/mod.rs @@ -1,5 +1,5 @@ -use super::{args::Scheme, Args, Rearrange}; -use crate::{common_cpu::Cpu, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Args, Rearrange, args::Scheme}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu}; use rayon::iter::{IntoParallelIterator, ParallelIterator}; pub struct Operator; @@ -15,14 +15,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, diff --git a/operators/src/rearrange/cuda/mod.rs b/operators/src/rearrange/cuda/mod.rs index 2350de67..9bfaba69 100644 --- a/operators/src/rearrange/cuda/mod.rs +++ b/operators/src/rearrange/cuda/mod.rs @@ -1,22 +1,93 @@ -use super::{args::Scheme, Args, Rearrange}; +use super::{Args, Rearrange, args::Scheme as ArgsScheme}; +use crate::rank_not_support; use crate::{ + ByteOf, LaunchError, QueueAlloc, SchemeDiversity, cuda::{Gpu, Handle, ModuleBox}, - rank_not_support, shape_not_support, ByteOf, LaunchError, QueueAlloc, SchemeError, }; +use itertools::Itertools; +use lru::LruCache; +use std::cmp::max; +use std::slice::{from_raw_parts, from_raw_parts_mut}; use std::{ ffi::CString, - slice::{from_raw_parts, from_raw_parts_mut}, - sync::Arc, + sync::{Arc, Mutex}, }; +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +struct SchemeKey { + unit_size: usize, + block_array_size: usize, + grid_array_size: usize, + constrain_num: usize, +} + +#[derive(Clone)] +struct Scheme { + module: Arc, + name: CString, +} + +impl Scheme { + pub fn new(key: SchemeKey, handle: &Arc) -> Self { + let name = kernel_name(key); + let cc = handle.device().compute_capability(); + // for DEBUG + // let code = format_code(key.unit_size, key.constrain_num); + // std::fs::write("rearrange.cu", code).unwrap(); + + Self { + module: handle.compile_kernel(&name, cc, || format_code(key)), + name: CString::new(name).unwrap(), + } + } +} + +/// Type used for array indices and strides +type ArrayType = i32; + +// 默认的数组大小,同时也是最大的数组大小,不能为0 +const DEFAULT_ARRAY_SIZE: usize = 5; +const CONSTRAIN_ARRAY_SIZE: usize = 8; + +#[derive(Debug)] +struct SplitDim { + choose_idx: usize, + num_per_block: usize, + num_per_grid: usize, + array_struct_idx_block: ArrayType, + array_struct_idx_grid: ArrayType, + dim_len: usize, +} + +#[derive(Debug)] +struct ArrayStruct(Vec); + +impl ArrayStruct { + fn new(mut array: Vec, default: ArrayType) -> Self { + while array.len() < DEFAULT_ARRAY_SIZE { + array.push(default); + } + Self(array) + } + + fn try_into_array(self) -> Result<[ArrayType; N], LaunchError> { + let ArrayStruct(vec) = self; + if vec.len() <= N { + Ok(std::array::from_fn(|i| vec.get(i).copied().unwrap_or(0))) + } else { + Err(rank_not_support("over length")) + } + } +} pub struct Operator { - _handle: Arc, + handle: Arc, + #[allow(unused)] max_warps_block: usize, + #[allow(unused)] warp_size: usize, - module: Arc, + schemes: Mutex>, } -const NAME: &str = "rearrange"; const CODE: &str = include_str!("rearrange.cuh"); impl Rearrange for Operator {} @@ -31,26 +102,16 @@ impl crate::Operator for Operator { let device = node.0.device(); let max_threads_block = device.block_limit().max_threads; let warp_size = device.warp_size(); - let cc = device.compute_capability(); assert_eq!(max_threads_block % warp_size, 0); // 生成执行资源 Self { - _handle: node.0.clone(), + handle: node.0.clone(), max_warps_block: max_threads_block / warp_size, warp_size, - module: node.0.compile_kernel(NAME, cc, format_code), + schemes: node.0.scheme_cache(SchemeDiversity::Low), } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - // 完全动态,不需要做任何准备工作 - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -60,156 +121,389 @@ impl crate::Operator for Operator { where QA: QueueAlloc, { - let scheme = Scheme::new(args)?; - if scheme.ndim() == 0 { - let unit = scheme.unit(); + let scheme_update = ArgsScheme::new(args)?; + + // 发现最大的1 thread 处理的数据量 + let scheme_update = scheme_update.distribute_unit((0..=5).rev().map(|n| (1 << n))); + if scheme_update.ndim() == 0 { + let unit = scheme_update.unit(); let dst = unsafe { from_raw_parts_mut(args.dst_base, unit) }; let src = unsafe { from_raw_parts(args.src_base, unit) }; queue_alloc.queue().memcpy_d2d(dst, src); return Ok(()); } - let scheme = scheme.distribute_unit((0..=5).rev().map(|n| 32 * (1 << n))); - let unit = scheme.unit(); + let src_strides = scheme_update.src_strides(); + let dst_strides = scheme_update.dst_strides(); + let shape = scheme_update.shape().collect::>(); + let unit = scheme_update.unit(); + let ndim = scheme_update.ndim(); - struct Layout { - r: u32, - c: u32, - dst_rs: i32, - dst_cs: i32, - src_rs: i32, - src_cs: i32, + //src strides 降序 index + let src_strides_desc_idx = (0..scheme_update.ndim()) + .zip(src_strides) + .sorted_by(|a, b| b.1.cmp(a.1)) + .map(|(i, _)| i) + .collect::>(); + + //分离维度,分成grid处理的维度和block处理的维度,与dst的维度相对应 + let mut block_dim_choose = vec![false; ndim]; + + //TODO 需要优化 + let max_block_size = 256; + let mut split_dims = Vec::new(); // 长度最多为2 + + //进行维度选择 + { + let mut src_choose_idx = ndim; + let mut dst_choose_idx = ndim; + + let mut block_elements = 1; + let mut block_src_elements = 1; + let mut block_dst_elements = 1; + + while src_choose_idx > 0 && dst_choose_idx > 0 { + let src_idx = src_strides_desc_idx[src_choose_idx - 1]; + let dst_idx = dst_choose_idx - 1; + + if src_idx == dst_idx { + let idx = src_idx; + let len = shape[idx]; + if block_elements * shape[src_idx] <= max_block_size { + //选择维度 + block_dim_choose[idx] = true; + block_elements *= len; + block_src_elements *= len; + block_dst_elements *= len; + src_choose_idx -= 1; + dst_choose_idx -= 1; + } else { + //切分维度,并退出 + let num_per_block = max_block_size.div_euclid(block_elements); + assert!(num_per_block > 0); + assert!(len >= num_per_block); + if num_per_block > 1 { + split_dims.push(SplitDim { + choose_idx: idx, + num_per_block, + num_per_grid: len.div_ceil(num_per_block), + array_struct_idx_block: 0, + array_struct_idx_grid: 0, + dim_len: len, + }); + } + break; + } + } else { + let src_div_dst = block_src_elements as f64 / block_dst_elements as f64; + let src_num_per_block = + (max_block_size as f64 / block_elements as f64 / src_div_dst).sqrt(); + let dst_num_per_block = src_num_per_block * src_div_dst; + + let src_current_dim_len = shape[src_idx]; + let dst_current_dim_len = shape[dst_idx]; + + if (src_current_dim_len as f64) < src_num_per_block { + //选择维度 + block_dim_choose[src_idx] = true; + block_elements *= src_current_dim_len; + block_src_elements *= src_current_dim_len; + src_choose_idx -= 1; + } else if (dst_current_dim_len as f64) < dst_num_per_block { + //选择维度 + block_dim_choose[dst_idx] = true; + block_elements *= dst_current_dim_len; + block_dst_elements *= dst_current_dim_len; + dst_choose_idx -= 1; + } else { + //切分维度,并退出 + let src_num_per_block = src_num_per_block.floor() as usize; + let dst_num_per_block = dst_num_per_block.floor() as usize; + let src_num_per_grid = src_current_dim_len.div_ceil(src_num_per_block); + let dst_num_per_grid = dst_current_dim_len.div_ceil(dst_num_per_block); + + if src_num_per_block == 1 { + } else if src_num_per_grid == 1 { + block_dim_choose[src_idx] = true; + } else { + split_dims.push(SplitDim { + choose_idx: src_idx, + num_per_block: src_num_per_block, + num_per_grid: src_num_per_grid, + array_struct_idx_block: 0, + array_struct_idx_grid: 0, + dim_len: src_current_dim_len, + }); + } + + if dst_num_per_block == 1 { + } else if dst_num_per_grid == 1 { + block_dim_choose[dst_idx] = true; + } else { + split_dims.push(SplitDim { + choose_idx: dst_idx, + num_per_block: dst_num_per_block, + num_per_grid: dst_num_per_grid, + array_struct_idx_block: 0, + array_struct_idx_grid: 0, + dim_len: dst_current_dim_len, + }); + } + break; + } + } + } } - let Layout { - r, - c, - dst_rs, - dst_cs, - src_rs, - src_cs, - } = match scheme.ndim() { - 0 => unreachable!(), - 1 => { - let &[dst_cs] = scheme.dst_strides() else { - unreachable!() - }; - let &[src_cs] = scheme.src_strides() else { - unreachable!() - }; - Layout { - r: 1, - c: scheme.shape().next().unwrap() as _, - dst_rs: 0, - dst_cs: dst_cs as _, - src_rs: 0, - src_cs: src_cs as _, + let mut block_dim: ArrayType = 0; + + let mut block_len = Vec::::with_capacity(DEFAULT_ARRAY_SIZE); + let mut src_block_stride = Vec::::with_capacity(DEFAULT_ARRAY_SIZE); + let mut dst_block_stride = Vec::::with_capacity(DEFAULT_ARRAY_SIZE); + + let mut grid_len = Vec::::with_capacity(DEFAULT_ARRAY_SIZE); + let mut src_grid_stride = Vec::::with_capacity(DEFAULT_ARRAY_SIZE); + let mut dst_grid_stride = Vec::::with_capacity(DEFAULT_ARRAY_SIZE); + + // 处理block,填充block_len,block_stride + for i in 0..ndim { + if block_dim_choose[i] { + block_len.push(shape[i] as ArrayType); + src_block_stride.push(src_strides[i] as ArrayType); + dst_block_stride.push(dst_strides[i] as ArrayType); + block_dim += 1; + } + + for split_dim in split_dims.iter_mut() { + if i == split_dim.choose_idx { + block_len.push(split_dim.num_per_block as ArrayType); + src_block_stride.push(src_strides[i] as ArrayType); + dst_block_stride.push(dst_strides[i] as ArrayType); + split_dim.array_struct_idx_block = block_dim; + block_dim += 1; } } - 2 => { - let mut shape = scheme.shape(); - let r = shape.next().unwrap(); - let c = shape.next().unwrap(); - let &[dst_rs, dst_cs] = scheme.dst_strides() else { - unreachable!() - }; - let &[src_rs, src_cs] = scheme.src_strides() else { - unreachable!() - }; - Layout { - r: r as _, - c: c as _, - dst_rs: dst_rs as _, - dst_cs: dst_cs as _, - src_rs: src_rs as _, - src_cs: src_cs as _, + } + + // 处理grid,填充grid_len,grid_stride + let mut grid_dim = 0_u32; + for i in 0..ndim { + let mut is_split = false; + if !block_dim_choose[i] { + for split_dim in split_dims.iter_mut() { + if i == split_dim.choose_idx { + is_split = true; + grid_len.push(split_dim.num_per_grid as ArrayType); + src_grid_stride + .push((src_strides[i] * split_dim.num_per_block as isize) as ArrayType); + dst_grid_stride + .push((dst_strides[i] * split_dim.num_per_block as isize) as ArrayType); + split_dim.array_struct_idx_grid = grid_dim as ArrayType; + } } + if !is_split { + grid_len.push(shape[i] as ArrayType); + src_grid_stride.push(src_strides[i] as ArrayType); + dst_grid_stride.push(dst_strides[i] as ArrayType); + } + grid_dim += 1; } - _ => Err(rank_not_support("rearrange not support ndim > 2 on NV GPU"))?, + } + + let filter_split_dims = split_dims + .iter() + .filter(|split_dim| split_dim.dim_len % split_dim.num_per_block != 0) + .collect::>(); + + let constrain_num = filter_split_dims.len(); + + // 准备kernel + let key = SchemeKey { + unit_size: unit, + constrain_num, + block_array_size: block_len.len(), + grid_array_size: grid_len.len(), }; - let name = CString::new(NAME).unwrap(); - if unit % self.warp_size != 0 { - Err(shape_not_support(format!( - "memory region {unit} is not align to warp size, which is not supported yet on NV GPU", - )))?; - } - let bytes_thread = (unit / self.warp_size) as u32; - if bytes_thread > 32 || !bytes_thread.is_power_of_two() { - Err(shape_not_support(format!( - "bytes per thread {bytes_thread} is not supported yet on NV GPU" - )))?; - } + let mut schemes = self.schemes.lock().unwrap(); - let warps = self.max_warps_block as u32; - let grid = (r, c.div_ceil(warps)); - let block = (c.div_ceil(grid.1), self.warp_size as u32); + let scheme = schemes.get_or_insert(key, || Scheme::new(key, &self.handle)); - let unit = unit as i32; - let dst_rs = dst_rs / unit; - let dst_cs = dst_cs / unit; - let src_rs = src_rs / unit; - let src_cs = src_cs / unit; + // 计算grid和block + let grid = grid_len.iter().product::() as u32; + let block = block_len.iter().product::() as u32; + + // cuda 参数准备 + let block_len_total = block_len.iter().map(|x| *x as u32).product::(); + let src_block_stride = ArrayStruct::new(src_block_stride, 0); + let dst_block_stride = ArrayStruct::new(dst_block_stride, 0); + let src_grid_stride = ArrayStruct::new(src_grid_stride, 0); + let dst_grid_stride = ArrayStruct::new(dst_grid_stride, 0); + let block_len = ArrayStruct::new(block_len, 1); + let grid_len = ArrayStruct::new(grid_len, 1); + + let constrains = match filter_split_dims.len() { + 0 => ArrayStruct(vec![0; 8]), + 1 => ArrayStruct(vec![ + filter_split_dims[0].array_struct_idx_grid, + filter_split_dims[0].array_struct_idx_block, + filter_split_dims[0].num_per_block as ArrayType, + filter_split_dims[0].dim_len as ArrayType, + 0, + 0, + 0, + 0, + ]), + 2 => ArrayStruct(vec![ + filter_split_dims[0].array_struct_idx_grid, + filter_split_dims[0].array_struct_idx_block, + filter_split_dims[0].num_per_block as ArrayType, + filter_split_dims[0].dim_len as ArrayType, + filter_split_dims[1].array_struct_idx_grid, + filter_split_dims[1].array_struct_idx_block, + filter_split_dims[1].num_per_block as ArrayType, + filter_split_dims[1].dim_len as ArrayType, + ]), + _ => unreachable!(), + }; let params = cuda::params![ args.dst_base, - dst_rs, - dst_cs, args.src_base, - src_rs, - src_cs, - c, - bytes_thread + block_dim, + block_len_total, + block_len.try_into_array::()?, // 各维度的长度 + src_block_stride.try_into_array::()?, // 源tensor在各维度上的步长(bytes) + dst_block_stride.try_into_array::()?, // 目标tensor在各维度上的步长(bytes) + grid_len.try_into_array::()?, // 各维度的长度 + src_grid_stride.try_into_array::()?, // 源tensor在各维度上的步长(bytes) + dst_grid_stride.try_into_array::()?, // 目标tensor在各维度上的步长(bytes) + constrains.try_into_array::()? ]; - self.module - .launch(&name, grid, block, params.as_ptr(), 0, queue_alloc.queue()); + + scheme.module.launch( + &scheme.name, + (grid, block, 0), + ¶ms.to_ptrs(), + queue_alloc.queue(), + ); Ok(()) } } -fn format_code() -> String { +fn kernel_name( + SchemeKey { + unit_size, + block_array_size, + grid_array_size, + constrain_num, + }: SchemeKey, +) -> String { + let tmem_type = match unit_size { + 1 => "uchar1", + 2 => "uchar2", + 4 => "float1", + 8 => "float2", + 16 => "float4", + 32 => "double4", + _ => unreachable!(), + }; format!( - r#"{CODE} + "rearrange_unit_{tmem_type}_block_{block_array_size}_grid_{grid_array_size}_constrain_{constrain_num}" + ) +} + +fn format_code( + SchemeKey { + unit_size, + block_array_size, + grid_array_size, + constrain_num, + }: SchemeKey, +) -> String { + assert!(block_array_size != 0); + + let kernel_name = kernel_name(SchemeKey { + unit_size, + block_array_size, + grid_array_size, + constrain_num, + }); + //处理 grid_array_size = 0的情况 + let grid_array_size = max(grid_array_size, 1); + + let mut code = String::new(); -extern "C" __global__ void {NAME}( - void *__restrict__ dst, - int const rsa, - int const csa, + let tmem_type = match unit_size { + 1 => "uchar1", + 2 => "uchar2", + 4 => "float1", + 8 => "float2", + 16 => "float4", + 32 => "double4", + _ => unreachable!(), + }; + + // 添加头部定义 + code.push_str(&format!("#define BLOCK_ARRAY_SIZE {block_array_size}\n")); + code.push_str(&format!("#define GRID_ARRAY_SIZE {grid_array_size}\n")); + code.push_str("#define ARRAY_TYPE int\n"); + code.push_str(&format!("#define CONSTRAIN_NUM {constrain_num}\n")); + code.push_str(CODE); + code.push('\n'); + + // 添加实例化宏调用 + code.push_str(&format!( + r#" +extern "C" __global__ void {kernel_name}( + void *__restrict__ dst, void const *__restrict__ src, - int const rsb, - int const csb, - unsigned int const ncols, - unsigned int const bytes_per_thread -){{ - switch (bytes_per_thread) {{ - case 1: rearrange(dst, rsa, csa, src, rsb, csb, ncols); break; - case 2: rearrange(dst, rsa, csa, src, rsb, csb, ncols); break; - case 4: rearrange(dst, rsa, csa, src, rsb, csb, ncols); break; - case 8: rearrange(dst, rsa, csa, src, rsb, csb, ncols); break; - case 16: rearrange(dst, rsa, csa, src, rsb, csb, ncols); break; - case 32: rearrange(dst, rsa, csa, src, rsb, csb, ncols); break; - }} + unsigned int const block_dim, + unsigned int const block_len_total, + const ArrayStruct block_len, + const ArrayStruct src_block_stride, + const ArrayStruct dst_block_stride, + const ArrayStruct grid_len, + const ArrayStruct src_grid_stride, + const ArrayStruct dst_grid_stride +#if CONSTRAIN_NUM > 0 + ,const ArrayStruct> constrains +#endif +) {{ + rearrange_kernel<{tmem_type}, {constrain_num}>( + dst, src, block_dim, block_len_total, + block_len, src_block_stride, dst_block_stride, + grid_len, src_grid_stride, dst_grid_stride +#if CONSTRAIN_NUM > 0 + ,constrains +#endif + ); }} "# - ) + )); + code.push('\n'); + + code } #[cfg(test)] mod test { + use std::time::Duration; + use super::{Args, Gpu, Operator}; use crate::{ConstPtr, Hardware, MutPtr, Operator as _, TensorLayout}; - use digit_layout::{types as ty, DigitLayout}; + use digit_layout::{DigitLayout, types as ty}; + use log::debug; - fn dyn_args(dt: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - dst_layout: TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]), - dst_base: null_mut(), - src_layout: TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]), - src_base: null(), - } - } + // fn dyn_args(dt: DigitLayout) -> Args { + // use std::ptr::{null, null_mut}; + // Args { + // dst_layout: TensorLayout::new(dt, &[0; 2], &[0; 2]), + // dst_base: null_mut(), + // src_layout: TensorLayout::new_dyn(dt, &[0; 2], &[0; 2]), + // src_base: null(), + // } + // } fn args( dt: DigitLayout, @@ -229,69 +523,126 @@ mod test { #[test] fn test_compile() { - use super::NAME; - use std::ffi::CString; + use super::Scheme; + use super::SchemeKey; let Some(gpu) = Gpu::init() else { return; }; println!("{}", gpu.0.device().info()); - let mut op = Operator::new(&gpu); - op.scheme(&dyn_args(ty::F16), 0).unwrap(); + let op = Operator::new(&gpu); + + // 遍历所有可能的unit_size和constrain_num组合,编译所有kernel + for unit_size in (0..=5).map(|n| (1 << n)) { + for constrain_num in 0..=2 { + println!( + "compile unit_size: {}, constrain_num: {}", + unit_size, constrain_num + ); + let key = SchemeKey { + unit_size, + constrain_num, + block_array_size: 5, + grid_array_size: 5, + }; + op.schemes + .lock() + .unwrap() + .get_or_insert(key, || Scheme::new(key, &op.handle)); + } + } - let module = op.module; + // 打印所有编译好的kernel信息 gpu.apply(|ctx| { - println!( - "{NAME}\n{}", - module.load(CString::new(NAME).unwrap(), ctx).info() - ); - }) + let schemes = op.schemes.lock().unwrap(); + for (key, scheme) in schemes.iter() { + println!("{:?}", scheme.name); + println!( + "unit_size: {}, constrain_num: {}\n{}", + key.unit_size, + key.constrain_num, + // scheme.name.to_str().unwrap(), + scheme.module.load(&scheme.name, ctx).info() + ); + println!("----------------------------------------"); + } + }); } - #[test] - fn test_compute() { + fn copute_with_check( + gpu: &Gpu, + shape: [usize; N], + ) -> Duration { + assert!(TRANS_N <= N, "TRANS_N must be less than or equal to N"); use super::super::common_cpu::Operator as RefOp; use crate::common_cpu::{Cpu, ThisThread}; + use cuda::memcpy_d2h; use ndarray_layout::{ArrayLayout, Endian::BigEndian}; use rand::Rng; - let Some(gpu) = Gpu::init() else { - return; - }; + let dt = ty::U64; + + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(gpu); - let dt = ty::U32; + let mut r_shape = shape; + r_shape[0..TRANS_N].reverse(); - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - cpu_op.scheme(&dyn_args(dt), 0).unwrap(); - gpu_op.scheme(&dyn_args(dt), 0).unwrap(); + let trans_param: [usize; TRANS_N] = + (0..TRANS_N).rev().collect::>().try_into().unwrap(); - let nh = 32; - let seq = 7; - let dh = 128; - let mut src = vec![0u32; nh * seq * dh]; + let mut src = vec![0u64; shape.iter().product::()]; rand::rng().fill(&mut src[..]); let ele = dt.nbytes(); - let s_src = ArrayLayout::<3>::new_contiguous(&[nh, seq, dh], BigEndian, ele); + let s_src = ArrayLayout::::new_contiguous(&shape, BigEndian, ele); let s_dst = - ArrayLayout::<3>::new_contiguous(&[seq, nh, dh], BigEndian, ele).transpose(&[1, 0]); + ArrayLayout::::new_contiguous(&r_shape, BigEndian, ele).transpose(&trans_param); + + debug!("s_src shape: {:?}", s_src.shape()); + debug!("s_dst shape: {:?}", s_dst.shape()); + debug!("s_src strides: {:?}", s_src.strides()); + debug!("s_dst strides: {:?}", s_dst.strides()); - let dst_ans = gpu.apply(|ctx| { + let (dst_ans, time) = gpu.apply(|ctx| { let stream = ctx.stream(); #[cfg(use_nvidia)] let rt = &stream; #[cfg(use_iluvatar)] let rt = ctx; + let src = rt.from_host(&src); let mut dst = rt.malloc::(src.len()); + + let start_event = stream.record(); + + stream.bench( + |_, stream| { + gpu_op + .launch( + &args( + dt, + &shape, + s_src.strides(), + s_dst.strides(), + src.as_ptr().cast(), + dst.as_mut_ptr().cast(), + ), + &mut [], + stream, + ) + .unwrap(); + }, + 5, + 1, + ); gpu_op .launch( &args( dt, - &[nh, seq, dh], + &shape, s_src.strides(), s_dst.strides(), src.as_ptr().cast(), @@ -301,17 +652,21 @@ mod test { &stream, ) .unwrap(); - let mut host = vec![0u32; nh * seq * dh]; + let end_event = stream.record(); + end_event.synchronize(); + let time = end_event.elapse_from(&start_event); + + let mut host = vec![0u64; shape.iter().product::()]; memcpy_d2h(&mut host, &dst); - host + (host, time) }); - let mut dst_ref = vec![0u32; seq * nh * dh]; + let mut dst_ref = vec![0u64; shape.iter().product::()]; cpu_op .launch( &args( dt, - &[nh, seq, dh], + &shape, s_src.strides(), s_dst.strides(), src.as_ptr().cast(), @@ -322,5 +677,32 @@ mod test { ) .unwrap(); assert_eq!(dst_ans, dst_ref); + time + } + + #[test] + fn test_compute() { + let Some(gpu) = Gpu::init() else { + return; + }; + let shape = [2]; + let time = copute_with_check::<1, 1>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [13]; + let time = copute_with_check::<1, 1>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [16, 2, 16]; + let time = copute_with_check::<3, 3>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [32, 2, 17]; + let time = copute_with_check::<3, 3>(&gpu, shape); + println!("time: {time:?}"); + + let shape = [32, 2, 17, 2, 13]; + let time = copute_with_check::<5, 5>(&gpu, shape); + println!("time: {time:?}"); } } diff --git a/operators/src/rearrange/cuda/rearrange.cuh b/operators/src/rearrange/cuda/rearrange.cuh index 7ac8ce28..e542d39e 100644 --- a/operators/src/rearrange/cuda/rearrange.cuh +++ b/operators/src/rearrange/cuda/rearrange.cuh @@ -1,23 +1,137 @@ -template -static __device__ void rearrange( + +#ifndef BLOCK_ARRAY_SIZE +#define BLOCK_ARRAY_SIZE 5 +#endif + +#ifndef GRID_ARRAY_SIZE +#define GRID_ARRAY_SIZE 5 +#endif + +#ifndef ARRAY_TYPE +#define ARRAY_TYPE int +#endif + +template +struct ArrayStruct { + ArrayType a[ArrSize]; +}; + +// 各个元素分别代表:[grid_idx, block_idx, grid 的stride相对于block的倍数,总的len限制] +template +struct Constrains { + ElementType grid_idx; + ElementType block_idx; + ElementType grid_div_block; + ElementType total_len; +}; + +// 主要的重排序内核模板 +template +__forceinline__ __device__ void rearrange_kernel( void *__restrict__ dst, - int const rsa, - int const csa, void const *__restrict__ src, - int const rsb, - int const csb, - unsigned int const ncols) { - - auto row = blockIdx.y, - col = blockIdx.x * blockDim.y + threadIdx.y; - if (col >= ncols) return; - - auto thread = threadIdx.x, - warp_size = blockDim.x; - auto i = (row * rsa + col * csa) * warp_size + thread; - auto j = (row * rsb + col * csb) * warp_size + thread; - // printf("%d %d %d %d: row = %d, col = %d, nrows = %d, ncols = %d, rsa = %d, rsb = %d, csa = %d, csb = %d, warp_size = %d, thread = %d, i = %d, j = %d\n", - // blockIdx.y, blockIdx.x, threadIdx.y, threadIdx.x, row, col, gridDim.y, ncols, rsa, rsb, csa, csb, warp_size, thread, i, j); - - reinterpret_cast(dst)[i] = reinterpret_cast(src)[j]; + unsigned int const block_dim, + unsigned int const block_len_total, // block_len 各元素的乘积 + const ArrayStruct block_len, // 各维度的长度 + const ArrayStruct src_block_stride,// 源tensor在各维度上的步长(bytes) + const ArrayStruct dst_block_stride,// 目标tensor在各维度上的步长(bytes) + const ArrayStruct grid_len, // 各维度的长度 + const ArrayStruct src_grid_stride, // 源tensor在各维度上的步长(bytes) + const ArrayStruct dst_grid_stride // 目标tensor在各维度上的步长(bytes) +#if CONSTRAIN_NUM > 0 + , + const ArrayStruct> constrains// 切分维度的约束条件数组 +#endif +) { + int remaining = threadIdx.x; + if (remaining >= block_len_total) { + return; + } + + // 声明共享内存 + __shared__ int shared_src_offset; + __shared__ int shared_dst_offset; +#if CONSTRAIN_NUM > 0 + __shared__ int shared_constrains_grid_idx_multiple[CONSTRAIN_NUM]; +#endif + + if (threadIdx.x == 0) {// 只让0号线程计算 + // 计算当前block处理的数据在src和dst中的基础偏移(bytes) + int src_offset = 0; + int dst_offset = 0; +#if CONSTRAIN_NUM > 0 + int constrains_grid_idx_multiple[CONSTRAIN_NUM] = {0}; +#endif + int remaining = blockIdx.x; + + for (int i = GRID_ARRAY_SIZE - 1; i >= 0; i--) { + int idx = remaining % grid_len.a[i]; + remaining /= grid_len.a[i]; + src_offset += idx * src_grid_stride.a[i]; + dst_offset += idx * dst_grid_stride.a[i]; +#if CONSTRAIN_NUM > 0 + for (int j = 0; j < CONSTRAIN_NUM; j++) { + if (i == constrains.a[j].grid_idx) { + constrains_grid_idx_multiple[j] = idx * constrains.a[j].grid_div_block; + } + } +#endif + } + + // 将结果存入共享内存 + shared_src_offset = src_offset; + shared_dst_offset = dst_offset; +#if CONSTRAIN_NUM > 0 + for (int j = 0; j < CONSTRAIN_NUM; j++) { + shared_constrains_grid_idx_multiple[j] = constrains_grid_idx_multiple[j]; + } +#endif + } + + // 确保所有线程都能看到共享内存中的值 + __syncthreads(); + + // 所有线程直接使用计算好的偏移值 + int src_offset = shared_src_offset; + int dst_offset = shared_dst_offset; +#if CONSTRAIN_NUM > 0 + int constrains_grid_idx_multiple[CONSTRAIN_NUM]; + for (int j = 0; j < CONSTRAIN_NUM; j++) { + constrains_grid_idx_multiple[j] = shared_constrains_grid_idx_multiple[j]; + } +#endif + + for (int i = BLOCK_ARRAY_SIZE - 1; i > 0; i--) { + + int idx = remaining % block_len.a[i]; + remaining /= block_len.a[i]; + // 计算偏移量 + src_offset += idx * src_block_stride.a[i]; + dst_offset += idx * dst_block_stride.a[i]; +#if CONSTRAIN_NUM > 0 + for (int j = 0; j < CONSTRAIN_NUM; j++) { + if (i == constrains.a[j].block_idx) { + if (constrains_grid_idx_multiple[j] + idx >= constrains.a[j].total_len) { + return; + } + } + } +#endif + } + + src_offset += remaining * src_block_stride.a[0]; + dst_offset += remaining * dst_block_stride.a[0]; +#if CONSTRAIN_NUM > 0 + for (int j = 0; j < CONSTRAIN_NUM; j++) { + if (0 == constrains.a[j].block_idx) { + if (constrains_grid_idx_multiple[j] + remaining >= constrains.a[j].total_len) { + return; + } + } + } +#endif + + // 执行数据拷贝,注意offset已经是字节偏移 + *reinterpret_cast(reinterpret_cast(dst) + dst_offset) = + *reinterpret_cast(reinterpret_cast(src) + src_offset); } diff --git a/operators/src/rearrange/infini/mod.rs b/operators/src/rearrange/infini/mod.rs index ce6895ea..65f244a3 100644 --- a/operators/src/rearrange/infini/mod.rs +++ b/operators/src/rearrange/infini/mod.rs @@ -1,7 +1,7 @@ -use super::{args::Scheme, Args, Rearrange}; -use crate::{infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Args, Rearrange, args::Scheme}; +use crate::{ByteOf, LaunchError, QueueAlloc, infini::Device}; use digit_layout::types; -use infini_op::{infiniop, AsRaw, Descriptor, Handle}; +use infini_op::{AsRaw, Descriptor, Handle, infiniop}; use std::{ slice::{from_raw_parts, from_raw_parts_mut}, sync::Arc, @@ -21,15 +21,6 @@ impl crate::Operator for Operator { Self(node.handle().clone()) } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -89,18 +80,7 @@ impl crate::Operator for Operator { mod test { use super::{Args, Device, Operator}; use crate::{ConstPtr, Hardware, MutPtr, Operator as _, TensorLayout}; - use digit_layout::{types as ty, DigitLayout}; - - fn dyn_args(dt: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - dst_layout: TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]), - dst_base: null_mut(), - src_layout: TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]), - src_base: null(), - } - } + use digit_layout::{DigitLayout, types as ty}; fn args( dt: DigitLayout, @@ -133,10 +113,8 @@ mod test { infini_rt::init(infini_rt::DEVICE_CPU); let dev = Device::cpu(); - let mut cpu_op = RefOp::new(&Cpu); - let mut dev_op = Operator::new(&dev); - cpu_op.scheme(&dyn_args(dt), 0).unwrap(); - dev_op.scheme(&dyn_args(dt), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let dev_op = Operator::new(&dev); let mut src = vec![0u32; nh * seq * dh]; rand::rng().fill(&mut src[..]); diff --git a/operators/src/rearrange/opencl/mod.rs b/operators/src/rearrange/opencl/mod.rs index 9ad81c71..95900d34 100644 --- a/operators/src/rearrange/opencl/mod.rs +++ b/operators/src/rearrange/opencl/mod.rs @@ -1,11 +1,11 @@ -use super::{args::Scheme, Args, Rearrange}; +use super::{Args, Rearrange, args::Scheme}; use crate::{ - opencl::{ClDevice, CodeGen, KernelCache, CL2_0}, - rank_not_support, ByteOf, LaunchError, QueueAlloc, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity::Low as LowDiversity, - SchemeError, + opencl::{CL2_0, ClDevice, CodeGen, KernelCache}, + rank_not_support, }; -use clrt::{bindings::cl_int, Context}; +use clrt::{Context, bindings::cl_int}; use lru::LruCache; use std::slice::{from_raw_parts, from_raw_parts_mut}; use std::sync::Mutex; @@ -39,14 +39,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -190,17 +182,6 @@ mod test { use crate::{ConstPtr, Hardware, MutPtr, TensorLayout}; use digit_layout::DigitLayout; - fn dyn_args(dt: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - dst_layout: TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]), - dst_base: null_mut(), - src_layout: TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]), - src_base: null(), - } - } - fn args( dt: DigitLayout, shape: &[usize], @@ -221,9 +202,9 @@ mod test { fn test_compute() { use super::{super::common_cpu::Operator as RefOp, Operator}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, opencl::ClDevice, - Operator as _, }; use clrt::Platform; use digit_layout::types as ty; @@ -233,14 +214,14 @@ mod test { let dt = ty::U32; - let mut cpu_op = RefOp::new(&Cpu); + let cpu_op = RefOp::new(&Cpu); for platform in Platform::all() { for device in platform.devices() { println!("device: {}", device.name()); let context = device.context(); let queue = context.queue(); - let mut cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); + let cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); let nh = 5; let seq = 32; @@ -254,8 +235,6 @@ mod test { .transpose(&[1, 0]); let dt = ty::U32; - cpu_op.scheme(&dyn_args(dt), 0).unwrap(); - cl_op.scheme(&dyn_args(dt), 0).unwrap(); let mut s_svm = context.malloc::(nh * seq * dh * 2); let mut d_svm = context.malloc::(nh * seq * dh); @@ -305,7 +284,7 @@ mod test { .unwrap(); let cpu_time = time.elapsed(); - let map = queue.map(&mut d_svm); + let map = queue.map(&d_svm); let ([], y_ans, []) = (unsafe { map.align_to::() }) else { panic!() }; diff --git a/operators/src/rms_norm/args.rs b/operators/src/rms_norm/args.rs index bb0f3de2..ba638cc6 100644 --- a/operators/src/rms_norm/args.rs +++ b/operators/src/rms_norm/args.rs @@ -1,6 +1,6 @@ -use crate::{ +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, utils::{dim_distinct, rank_error, type_distinct}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::DigitLayout; @@ -17,12 +17,12 @@ pub struct Args { pub(super) struct Meta { pub dt_a: DigitLayout, pub dt_w: DigitLayout, - pub n: MaybeDyn, - pub d: MaybeDyn, + pub n: usize, + pub d: usize, } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { y_layout, x_layout, @@ -30,21 +30,21 @@ impl Args { .. } = self; - let &[ny, dy] = y_layout.shape() else { + let &[ny, dy] = &*y_layout.shape() else { return Err(rank_error("y", 2, y_layout.ndim())); }; - let &[nx, dx] = x_layout.shape() else { + let &[nx, dx] = &*x_layout.shape() else { return Err(rank_error("x", 2, x_layout.ndim())); }; - let &[dw] = w_layout.shape() else { + let &[dw] = &*w_layout.shape() else { return Err(rank_error("w", 1, w_layout.ndim())); }; Ok(Meta { - dt_a: type_distinct(&[y_layout.dt(), x_layout.dt()])?, - dt_w: w_layout.dt(), - n: dim_distinct(&[ny, nx])?, - d: dim_distinct(&[dy, dx, dw])?, + dt_a: type_distinct(&[y_layout.dt, x_layout.dt])?, + dt_w: w_layout.dt, + n: dim_distinct(&[ny, nx]).expect("n mismatch"), + d: dim_distinct(&[dy, dx, dw]).expect("d mismatch"), }) } } diff --git a/operators/src/rms_norm/common_cpu/mod.rs b/operators/src/rms_norm/common_cpu/mod.rs index 14ffbea4..8e3caf54 100644 --- a/operators/src/rms_norm/common_cpu/mod.rs +++ b/operators/src/rms_norm/common_cpu/mod.rs @@ -1,5 +1,5 @@ -use super::{args::Meta, Args, RmsNorm}; -use crate::{common_cpu::Cpu, get_static, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Args, RmsNorm, args::Meta}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu}; use half::f16; use rayon::iter::{IntoParallelIterator, ParallelIterator}; @@ -16,15 +16,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let _meta = args.meta()?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -54,13 +45,6 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - nsy dsy - nsx dsx - dsw - } - macro_rules! calculate { ($w:ty, $a:ty) => { Scheme::<$w, $a> { @@ -113,15 +97,15 @@ unsafe impl Sync for Scheme {} impl Scheme { #[inline] unsafe fn y_ptr(&self, i: isize, j: isize) -> *mut A { - self.y.byte_offset(i * self.nsy + j * self.dsy) + unsafe { self.y.byte_offset(i * self.nsy + j * self.dsy) } } #[inline] unsafe fn x_ptr(&self, i: isize, j: isize) -> *const A { - self.x.byte_offset(i * self.nsx + j * self.dsx) + unsafe { self.x.byte_offset(i * self.nsx + j * self.dsx) } } #[inline] unsafe fn w_ptr(&self, j: isize) -> *const W { - self.w.byte_offset(j * self.dsw) + unsafe { self.w.byte_offset(j * self.dsw) } } } @@ -141,11 +125,11 @@ impl Scheme { #[inline] unsafe fn y(&self, i: isize, j: isize, val: f32) { - self.y_ptr(i, j).write(f16::from_f32(val)) + unsafe { self.y_ptr(i, j).write(f16::from_f32(val)) } } #[inline] unsafe fn x(&self, i: isize, j: isize) -> f32 { - self.x_ptr(i, j).read().to_f32() + unsafe { self.x_ptr(i, j).read().to_f32() } } } impl Scheme { @@ -153,11 +137,11 @@ impl Scheme { #[inline] unsafe fn y(&self, i: isize, j: isize, val: f32) { - self.y_ptr(i, j).write(val) + unsafe { self.y_ptr(i, j).write(val) } } #[inline] unsafe fn x(&self, i: isize, j: isize) -> f32 { - self.x_ptr(i, j).read() + unsafe { self.x_ptr(i, j).read() } } } impl Scheme { @@ -165,30 +149,30 @@ impl Scheme { #[inline] unsafe fn y(&self, i: isize, j: isize, val: f64) { - self.y_ptr(i, j).write(val) + unsafe { self.y_ptr(i, j).write(val) } } #[inline] unsafe fn x(&self, i: isize, j: isize) -> f64 { - self.x_ptr(i, j).read() + unsafe { self.x_ptr(i, j).read() } } } impl Scheme { #[inline] unsafe fn w(&self, j: isize) -> f32 { - self.w_ptr(j).read().to_f32() + unsafe { self.w_ptr(j).read() }.to_f32() } } impl Scheme { #[inline] unsafe fn w(&self, j: isize) -> f32 { - self.w_ptr(j).read() + unsafe { self.w_ptr(j).read() } } } impl Scheme { #[inline] unsafe fn w(&self, j: isize) -> f64 { - self.w_ptr(j).read() + unsafe { self.w_ptr(j).read() } } } diff --git a/operators/src/rms_norm/cuda/mod.rs b/operators/src/rms_norm/cuda/mod.rs index e09cbcdd..21042d5a 100644 --- a/operators/src/rms_norm/cuda/mod.rs +++ b/operators/src/rms_norm/cuda/mod.rs @@ -1,9 +1,10 @@ -use super::{args::Meta, Args, RmsNorm}; +use super::{Args, RmsNorm, args::Meta}; use crate::{ - cuda::{dt_name, Gpu, Handle, ModuleBox}, - get_static, shape_not_support, strides_not_support, ByteOf, LaunchError, QueueAlloc, - SchemeDiversity, SchemeError, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity, + cuda::{Gpu, Handle, ModuleBox, dt_name}, + shape_not_support, strides_not_support, }; +use cuda::params; use digit_layout::DigitLayout; use lru::LruCache; use std::{ @@ -30,22 +31,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt_a, dt_w, d, .. } = args.meta()?; - get_static!(d); - - let key = SchemeKey { dt_a, dt_w, d }; - self.schemes - .lock() - .unwrap() - .try_get_or_insert(key, || Scheme::new(&self.handle, key))?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -75,16 +60,9 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - yns yds - xns xds - wds - } - let unit = dt_a.nbytes() as isize; if yds != unit || xds != unit || wds != dt_w.nbytes() as isize { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; let key = SchemeKey { dt_a, dt_w, d }; @@ -97,17 +75,19 @@ impl crate::Operator for Operator { let nsy = (yns / unit) as i32; let nsx = (xns / unit) as i32; - let params = cuda::params![y_base, nsy, x_base, nsx, w_base, epsilon]; + let params = params![*y_base, nsy, *x_base, nsx, *w_base, *epsilon]; scheme.module.launch( &scheme.name, - n as u32, - match scheme.ty { - SchemeType::Padding => d, - SchemeType::Folding { block_size } => block_size, - } as u32, - params.as_ptr(), - 0, + ( + n as u32, + match scheme.ty { + SchemeType::Padding => d, + SchemeType::Folding { block_size } => block_size, + } as u32, + 0, + ), + ¶ms.to_ptrs(), queue_alloc.queue(), ); Ok(()) @@ -138,7 +118,7 @@ impl Scheme { pub fn new( handle: &Arc, SchemeKey { dt_a, dt_w, d }: SchemeKey, - ) -> Result { + ) -> Result { let device = handle.device(); let cc = device.compute_capability(); let block_size = device.block_limit().max_threads; @@ -222,24 +202,10 @@ mod test { use super::{Args, Gpu, Operator}; use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F16, F32, F64}, DigitLayout, + types::{F16, F32, F64}, }; - fn dyn_args(dt_w: DigitLayout, dt_a: DigitLayout, d: usize) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - y_layout: TensorLayout::new_dyn(dt_a, &[dyn_(), d.into()], &[dyn_(); 2]), - y_base: null_mut(), - x_layout: TensorLayout::new_dyn(dt_a, &[dyn_(), d.into()], &[dyn_(); 2]), - x_base: null(), - w_layout: TensorLayout::new_dyn(dt_w, &[d.into()], &[dyn_()]), - w_base: null(), - epsilon: 1e-5, - } - } - fn args( dt_w: DigitLayout, dt_a: DigitLayout, @@ -261,28 +227,6 @@ mod test { } } - #[test] - fn test_compile() { - let Some(gpu) = Gpu::init() else { - return; - }; - println!("{}", gpu.0.device().info()); - - let mut op = Operator::new(&gpu); - for k in 8..=13 { - let d = 1 << k; - op.scheme(&dyn_args(F32, F16, d), 0).unwrap(); - let scheme = op.schemes.lock().unwrap().iter().next().unwrap().1.clone(); - gpu.apply(|ctx| { - println!( - "{}\n{}", - scheme.name.to_str().unwrap(), - scheme.module.load(&scheme.name, ctx).info() - ) - }); - } - } - #[test] fn test_compute() { use super::super::common_cpu::Operator as RefOp; @@ -300,13 +244,11 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); for k in 8..=13 { let n = 4; let d = 1 << k; - cpu_op.scheme(&dyn_args(F64, F64, d), 0).unwrap(); - gpu_op.scheme(&dyn_args(F32, F16, d), 0).unwrap(); let mut x = vec![0.0f64; n * d]; let mut w = vec![0.0f64; d]; diff --git a/operators/src/rms_norm/cuda/rms_norm.cuh b/operators/src/rms_norm/cuda/rms_norm.cuh index 626ec926..65eee528 100644 --- a/operators/src/rms_norm/cuda/rms_norm.cuh +++ b/operators/src/rms_norm/cuda/rms_norm.cuh @@ -3,7 +3,7 @@ #include // assert BLOCK_SIZE >= blockDim.x -template +template static __device__ void padding( Ta *__restrict__ y_, int const stride_y, @@ -29,7 +29,7 @@ static __device__ void padding( *y = Ta(rms * x * w); } -template +template static __device__ void folding( Ta *__restrict__ y, int const stride_y, diff --git a/operators/src/rms_norm/infini/mod.rs b/operators/src/rms_norm/infini/mod.rs index 7a37568d..de42f5e3 100644 --- a/operators/src/rms_norm/infini/mod.rs +++ b/operators/src/rms_norm/infini/mod.rs @@ -1,6 +1,6 @@ -use super::{args::Meta, Args, RmsNorm}; -use crate::{get_static, infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError, Workspace}; -use infini_op::{infiniop, AsRaw, Descriptor}; +use super::{Args, RmsNorm, args::Meta}; +use crate::{ByteOf, LaunchError, QueueAlloc, Workspace, infini::Device}; +use infini_op::{AsRaw, Descriptor, infiniop}; pub struct Operator(Device); @@ -16,15 +16,6 @@ impl crate::Operator for Operator { Self(node.clone()) } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -54,13 +45,6 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - yns yds - xns xds - wds - } - let y = infini_op::Tensor::new(dt_a, [n, d], [yns, yds]); let x = infini_op::Tensor::new(dt_a, [n, d], [xns, xds]); let w = infini_op::Tensor::new(dt_w, [d], [wds]); @@ -102,25 +86,11 @@ mod test { use super::{Args, Device, Operator}; use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F16, F32, F64}, DigitLayout, + types::{F16, F32, F64}, }; use rayon::iter::ParallelIterator; - fn dyn_args(dt_w: DigitLayout, dt_a: DigitLayout, d: usize) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - y_layout: TensorLayout::new_dyn(dt_a, &[dyn_(), d.into()], &[dyn_(); 2]), - y_base: null_mut(), - x_layout: TensorLayout::new_dyn(dt_a, &[dyn_(), d.into()], &[dyn_(); 2]), - x_base: null(), - w_layout: TensorLayout::new_dyn(dt_w, &[d.into()], &[dyn_()]), - w_base: null(), - epsilon: 1e-5, - } - } - fn args( dt_w: DigitLayout, dt_a: DigitLayout, @@ -157,14 +127,12 @@ mod test { infini_rt::init(infini_rt::DEVICE_CPU); let dev = Device::cpu(); - let mut cpu_op = RefOp::new(&Cpu); - let mut dev_op = Operator::new(&dev); + let cpu_op = RefOp::new(&Cpu); + let dev_op = Operator::new(&dev); for k in 8..=13 { let n = 4; let d = 1 << k; - cpu_op.scheme(&dyn_args(F64, F64, d), 0).unwrap(); - dev_op.scheme(&dyn_args(F32, F16, d), 0).unwrap(); let mut x = vec![0.0f64; n * d]; let mut w = vec![0.0f64; d]; diff --git a/operators/src/rms_norm/opencl/mod.rs b/operators/src/rms_norm/opencl/mod.rs index f1a3b9ca..4b0f78a5 100644 --- a/operators/src/rms_norm/opencl/mod.rs +++ b/operators/src/rms_norm/opencl/mod.rs @@ -1,16 +1,14 @@ -use super::{args::Meta, Args, RmsNorm}; +use super::{Args, RmsNorm, args::Meta}; use crate::{ - get_static, - opencl::{ClDevice, CodeGen, KernelCache, CL2_0}, ByteOf, LaunchError, QueueAlloc, SchemeDiversity::Low as LowDiversity, - SchemeError, + opencl::{CL2_0, ClDevice, CodeGen, KernelCache}, }; use clrt::{ - bindings::{cl_int, cl_uint}, Context, + bindings::{cl_int, cl_uint}, }; -use digit_layout::{types as Ty, DigitLayout}; +use digit_layout::{DigitLayout, types as Ty}; use lru::LruCache; use std::sync::Mutex; @@ -43,18 +41,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt_a, dt_w, d, .. } = args.meta()?; - if let Some(&d) = d.get_static() { - self.cache_kernel(dt_a, dt_w, d); - } - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -80,11 +66,6 @@ impl crate::Operator for Operator { let &[nsx, ..] = x_layout.strides() else { unreachable!() }; - get_static! { - n d - nsy - nsx - } let (key, group_size) = self.cache_kernel(dt_a, dt_w, d); @@ -169,20 +150,6 @@ mod test { use crate::{Hardware, TensorLayout}; use digit_layout::DigitLayout; - fn dyn_args(dt_w: DigitLayout, dt_a: DigitLayout, d: usize) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - y_layout: TensorLayout::new_dyn(dt_a, &[dyn_(), d.into()], &[dyn_(); 2]), - y_base: null_mut(), - x_layout: TensorLayout::new_dyn(dt_a, &[dyn_(), d.into()], &[dyn_(); 2]), - x_base: null(), - w_layout: TensorLayout::new_dyn(dt_w, &[d.into()], &[dyn_()]), - w_base: null(), - epsilon: 1e-5, - } - } - fn args( dt_w: DigitLayout, dt_a: DigitLayout, @@ -208,10 +175,10 @@ mod test { fn test_compute() { use super::{super::common_cpu::Operator as RefOp, Operator}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, opencl::ClDevice, test_utils::{Diff, ErrorCollector}, - Operator as _, }; use clrt::Platform; use digit_layout::types as ty; @@ -219,22 +186,19 @@ mod test { use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use std::{iter::zip, time::Instant}; - let mut cpu_op = RefOp::new(&Cpu); + let cpu_op = RefOp::new(&Cpu); for platform in Platform::all() { for device in platform.devices() { println!("device: {}", device.name()); let context = device.context(); let queue = context.queue(); - let mut cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); + let cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); for k in 2..=12 { let n = 5; let d = 1 << k; - cpu_op.scheme(&dyn_args(ty::F64, ty::F64, d), 0).unwrap(); - cl_op.scheme(&dyn_args(ty::F32, ty::F32, d), 0).unwrap(); - let mut x = vec![0.0f64; n * d]; let mut w = vec![0.0f64; d]; rand::rng().fill(&mut x[..]); @@ -301,7 +265,7 @@ mod test { .unwrap(); let cpu_time = time.elapsed(); - let map = queue.map(&mut y_svm); + let map = queue.map(&y_svm); let ([], y_ans, []) = (unsafe { map.align_to::() }) else { panic!() }; diff --git a/operators/src/rope/args.rs b/operators/src/rope/args.rs index 6c3dff97..b1ce84e5 100644 --- a/operators/src/rope/args.rs +++ b/operators/src/rope/args.rs @@ -1,7 +1,6 @@ -use crate::{ - type_not_support, +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, type_not_support, utils::{dim_distinct, rank_error}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::DigitLayout; @@ -20,13 +19,13 @@ pub struct Args { pub(super) struct Meta { pub dt_t: DigitLayout, pub dt_p: DigitLayout, - pub nt: MaybeDyn, + pub nt: usize, #[allow(dead_code)] - pub dh: MaybeDyn, + pub dh: usize, } impl Args { - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { t_layout, p_layout, @@ -35,21 +34,21 @@ impl Args { .. } = self; - let &[nt, _, dh] = t_layout.shape() else { + let &[nt, _, dh] = &*t_layout.shape() else { return Err(rank_error("t", 3, t_layout.ndim())); }; - let &[np] = p_layout.shape() else { + let &[np] = &*p_layout.shape() else { return Err(rank_error("p", 1, p_layout.ndim())); }; - let &[_, dh_sin] = sin_layout.shape() else { + let &[_, dh_sin] = &*sin_layout.shape() else { return Err(rank_error("sin", 2, sin_layout.ndim())); }; - let &[_, dh_cos] = cos_layout.shape() else { + let &[_, dh_cos] = &*cos_layout.shape() else { return Err(rank_error("cos", 2, cos_layout.ndim())); }; - let dt_t = t_layout.dt(); - let dt_p = p_layout.dt(); + let dt_t = t_layout.dt; + let dt_p = p_layout.dt; use digit_layout::LayoutContent::{Real, Unsigned}; // tokens must be floating-point numbers if !matches!(dt_t.decode(), Real { exponent: 1.., .. },) { @@ -66,8 +65,8 @@ impl Args { Ok(Meta { dt_t, dt_p, - nt: dim_distinct(&[nt, np])?, - dh: dim_distinct(&[dh, dh_sin, dh_cos])?, + nt: dim_distinct(&[nt, np]).expect("nt mismatch"), + dh: dim_distinct(&[dh, dh_sin, dh_cos]).expect("dh mismatch"), }) } } diff --git a/operators/src/rope/common_cpu/mod.rs b/operators/src/rope/common_cpu/mod.rs index 0630555b..14f6853b 100644 --- a/operators/src/rope/common_cpu/mod.rs +++ b/operators/src/rope/common_cpu/mod.rs @@ -1,9 +1,6 @@ -use super::{args::Meta, fill_pos, Args, Rope, Seq, SinCosTable}; -use crate::{ - common_cpu::Cpu, get_static, strides_not_support, ByteOf, LaunchError, QueueAlloc, SchemeError, - Unsigned, -}; -use digit_layout::{types as ty, DigitLayout}; +use super::{Args, Rope, Seq, SinCosTable, args::Meta, fill_pos}; +use crate::{ByteOf, LaunchError, QueueAlloc, Unsigned, common_cpu::Cpu, strides_not_support}; +use digit_layout::{DigitLayout, types as ty}; use half::f16; pub struct Operator; @@ -53,15 +50,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let _meta = args.meta()?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -80,7 +68,7 @@ impl crate::Operator for Operator { theta, .. } = args; - let &[_, nh, dh] = t_layout.shape() else { + let &[_, nh, dh] = &*t_layout.shape() else { unreachable!() }; let &[st, sh, sd] = t_layout.strides() else { @@ -90,13 +78,8 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - nt nh dh - st sh sd - sp - } if sd != dt_t.nbytes() as isize { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); } macro_rules! calculate { diff --git a/operators/src/rope/cuda/mod.rs b/operators/src/rope/cuda/mod.rs index 9d457120..16470f9d 100644 --- a/operators/src/rope/cuda/mod.rs +++ b/operators/src/rope/cuda/mod.rs @@ -1,10 +1,10 @@ -use super::{args::Meta, fill_pos, Args, Rope, Seq, SinCosTable}; +use super::{Args, Rope, Seq, SinCosTable, args::Meta, fill_pos}; use crate::{ + Blob, ByteOf, LaunchError, QueueAlloc, cuda::{Gpu, Handle, ModuleBox}, - get_static, shape_not_support, strides_not_support, type_not_support, Blob, ByteOf, - LaunchError, QueueAlloc, SchemeError, + shape_not_support, strides_not_support, type_not_support, }; -use digit_layout::{types as ty, DigitLayout}; +use digit_layout::{DigitLayout, types as ty}; use std::{ffi::CString, sync::Arc}; pub struct Operator { @@ -70,14 +70,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -92,12 +84,12 @@ impl crate::Operator for Operator { } = args.meta()?; if dt_t != ty::F16 { - return Err(type_not_support("").into()); + return Err(type_not_support("")); } let name = match dt_p { ty::U32 => POS_U32, ty::U64 => POS_U64, - _ => return Err(type_not_support("").into()), + _ => return Err(type_not_support("")), }; let Args { @@ -108,7 +100,7 @@ impl crate::Operator for Operator { theta, .. } = args; - let &[_, nh, _] = t_layout.shape() else { + let &[_, nh, _] = &*t_layout.shape() else { unreachable!() }; let &[st, sh, sd] = t_layout.strides() else { @@ -118,24 +110,18 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - nt nh dh - st sh sd - sp - } - let unit = dt_t.nbytes() as isize; if sd != unit || sp != dt_p.nbytes() as isize { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); } let dh = dh / 2; let st = (st / unit / 2) as i32; let sh = (sh / unit / 2) as i32; - let params = cuda::params![t_base, st, sh, p_base, theta]; + let params = cuda::params![*t_base, st, sh, *p_base, *theta]; if self.max_threads_block % dh != 0 { - return Err(shape_not_support("").into()); + return Err(shape_not_support("")); } let max_nh_l = (self.max_threads_block / dh).min(nh); @@ -144,10 +130,8 @@ impl crate::Operator for Operator { self.module.launch( CString::new(name).unwrap(), - (nt as _, nh_h as _), - (nh_l as _, dh as _), - params.as_ptr(), - 0, + ((nt as _, nh_h as _), (nh_l as _, dh as _), 0), + ¶ms.to_ptrs(), queue_alloc.queue(), ); Ok(()) @@ -183,29 +167,14 @@ extern "C" __global__ void {POS_U64}( #[cfg(test)] mod test { - use super::{Args, Gpu, Operator, POS_U32, POS_U64}; + use super::{Args, Gpu, Operator}; use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F16, F64, U32}, DigitLayout, + types::{F16, F64, U32}, }; - fn dyn_args(dt_t: DigitLayout, dt_p: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - t_layout: TensorLayout::new_dyn(dt_t, &[dyn_(); 3], &[dyn_(); 3]), - t_base: null_mut(), - p_layout: TensorLayout::new_dyn(dt_p, &[dyn_()], &[dyn_()]), - p_base: null(), - sin_layout: TensorLayout::new_dyn(dt_t, &[dyn_(); 2], &[dyn_(); 2]), - sin_base: null(), - cos_layout: TensorLayout::new_dyn(dt_t, &[dyn_(); 2], &[dyn_(); 2]), - cos_base: null(), - theta: 0., - } - } - + #[allow(clippy::too_many_arguments)] fn args( dt_t: DigitLayout, dt_p: DigitLayout, @@ -230,30 +199,6 @@ mod test { } } - #[test] - fn test_compile() { - use std::ffi::CString; - - let Some(gpu) = Gpu::init() else { - return; - }; - println!("{}", gpu.0.device().info()); - - let mut op = Operator::new(&gpu); - op.scheme(&dyn_args(F16, U32), 0).unwrap(); - - gpu.apply(|ctx| { - println!( - "{POS_U32}\n{}", - op.module.load(CString::new(POS_U32).unwrap(), ctx).info() - ); - println!( - "{POS_U64}\n{}", - op.module.load(CString::new(POS_U64).unwrap(), ctx).info() - ); - }) - } - #[test] fn test_compute() { use super::super::common_cpu::Operator as RefOp; @@ -270,10 +215,8 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - cpu_op.scheme(&dyn_args(F64, U32), 0).unwrap(); - gpu_op.scheme(&dyn_args(F16, U32), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); const NT: usize = 7; let nh = 32; diff --git a/operators/src/rope/cuda/rope.cuh b/operators/src/rope/cuda/rope.cuh index 6daa1a4b..4373d8b3 100644 --- a/operators/src/rope/cuda/rope.cuh +++ b/operators/src/rope/cuda/rope.cuh @@ -1,6 +1,6 @@ #include -template +template static __device__ void padding( half2 *__restrict__ t, int const stride_token, @@ -14,11 +14,11 @@ static __device__ void padding( nh_l = blockDim.y, dh = blockDim.x, - it = blockIdx.y, // token index - ih_h = blockIdx.x, // head index (high) - ih_l = threadIdx.y, // head index (low) - ih = ih_h * nh_l + ih_l,// head index - i = threadIdx.x; // element index + it = blockIdx.y, // token index + ih_h = blockIdx.x, // head index (high) + ih_l = threadIdx.y, // head index (low) + ih = ih_h * nh_l + ih_l, // head index + i = threadIdx.x; // element index t += it * stride_token + ih * stride_head + i; float a = t->x, b = t->y, sin, cos; diff --git a/operators/src/rope/infini/mod.rs b/operators/src/rope/infini/mod.rs index 79ef85dd..3bbbbf4f 100644 --- a/operators/src/rope/infini/mod.rs +++ b/operators/src/rope/infini/mod.rs @@ -1,9 +1,7 @@ -use super::{args::Meta, fill_pos, Args, Rope, Seq, SinCosTable}; -use crate::{ - get_static, infini::Device, Blob, ByteOf, LaunchError, QueueAlloc, SchemeError, Workspace, -}; -use digit_layout::{types as ty, DigitLayout}; -use infini_op::{infiniop, AsRaw, Descriptor}; +use super::{Args, Rope, Seq, SinCosTable, args::Meta, fill_pos}; +use crate::{Blob, ByteOf, LaunchError, QueueAlloc, Workspace, infini::Device}; +use digit_layout::{DigitLayout, types as ty}; +use infini_op::{AsRaw, Descriptor, infiniop}; pub struct Operator(Device); @@ -71,15 +69,6 @@ impl crate::Operator for Operator { Self(node.clone()) } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -100,7 +89,7 @@ impl crate::Operator for Operator { .. } = args; - let &[nctx, nh, dh] = t_layout.shape() else { + let &[nctx, nh, dh] = &*t_layout.shape() else { unreachable!() }; let &[ncs, nhs, dhs] = t_layout.strides() else { @@ -116,18 +105,10 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - nctx nh dh - ncs nhs dhs - ps - sns sds - snc sdc - } - let t = infini_op::Tensor::new(dt_t, [nctx, nh, dh], [ncs, nhs, dhs]); let p = infini_op::Tensor::new(dt_p, [nctx], [ps]); - let sin = infini_op::Tensor::new(sin_layout.dt(), [nctx, dh], [sns, sds]); - let cos = infini_op::Tensor::new(cos_layout.dt(), [nctx, dh], [snc, sdc]); + let sin = infini_op::Tensor::new(sin_layout.dt, [nctx, dh], [sns, sds]); + let cos = infini_op::Tensor::new(cos_layout.dt, [nctx, dh], [snc, sdc]); let descriptor = Descriptor::new( |ptr| { @@ -166,26 +147,11 @@ impl crate::Operator for Operator { #[cfg(test)] mod test { use super::{Args, Device, Operator}; - use crate::{rope::Rope, Hardware, Operator as _, TensorLayout}; - use digit_layout::{types as ty, DigitLayout}; + use crate::{Hardware, Operator as _, TensorLayout, rope::Rope}; + use digit_layout::{DigitLayout, types as ty}; use std::ptr::null; - fn dyn_args(dt_t: DigitLayout, dt_p: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - t_layout: TensorLayout::new_dyn(dt_t, &[dyn_(); 3], &[dyn_(); 3]), - t_base: null_mut(), - p_layout: TensorLayout::new_dyn(dt_p, &[dyn_()], &[dyn_()]), - p_base: null(), - sin_layout: TensorLayout::new_dyn(ty::F32, &[dyn_(); 2], &[dyn_(); 2]), - sin_base: null(), - cos_layout: TensorLayout::new_dyn(ty::F32, &[dyn_(); 2], &[dyn_(); 2]), - cos_base: null(), - theta: 0., - } - } - + #[allow(clippy::too_many_arguments)] fn args( dt_t: DigitLayout, dt_p: DigitLayout, @@ -200,15 +166,15 @@ mod test { ) -> Args { use ndarray_layout::{ArrayLayout, Endian::BigEndian}; Args { - t_layout: TensorLayout::from_arr( - dt_t, - &ArrayLayout::<3>::new_contiguous(&[nt, nh, dh], BigEndian, dt_t.nbytes()).slice( + t_layout: TensorLayout { + dt: dt_t, + layout: ArrayLayout::new_contiguous(&[nt, nh, dh], BigEndian, dt_t.nbytes()).slice( 1, 4, 1, nh - 8, ), - ), + }, t_base, p_layout: TensorLayout::new_contiguous(dt_p, &[nt]), p_base, @@ -235,11 +201,8 @@ mod test { infini_rt::init(infini_rt::DEVICE_CPU); let dev = Device::cpu(); - let mut cpu_op = RefOp::new(&Cpu); - let mut dev_op = Operator::new(&dev); - - cpu_op.scheme(&dyn_args(ty::F64, ty::U32), 0).unwrap(); - dev_op.scheme(&dyn_args(ty::F16, ty::U64), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let dev_op = Operator::new(&dev); const NT: usize = 7; let nh = 32; diff --git a/operators/src/rope/opencl/mod.rs b/operators/src/rope/opencl/mod.rs index ee619e40..c70124df 100644 --- a/operators/src/rope/opencl/mod.rs +++ b/operators/src/rope/opencl/mod.rs @@ -1,13 +1,12 @@ -use super::{args::Meta, fill_pos, Args, Rope, Seq, SinCosTable}; +use super::{Args, Rope, Seq, SinCosTable, args::Meta, fill_pos}; use crate::{ - get_static, - opencl::{ClDevice, CodeGen, KernelCache, CL2_0}, - shape_not_support, strides_not_support, ByteOf, LaunchError, QueueAlloc, + ByteOf, LaunchError, QueueAlloc, SchemeDiversity::Low as LowDiversity, - SchemeError, + opencl::{CL2_0, ClDevice, CodeGen, KernelCache}, + shape_not_support, strides_not_support, }; -use clrt::{bindings::cl_int, Context}; -use digit_layout::{types as Ty, DigitLayout}; +use clrt::{Context, bindings::cl_int}; +use digit_layout::{DigitLayout, types as Ty}; use lru::LruCache; use std::sync::Mutex; use std::{alloc::Layout, iter::zip}; @@ -101,14 +100,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -130,7 +121,7 @@ impl crate::Operator for Operator { theta, .. } = args; - let &[_, nh, _] = t_layout.shape() else { + let &[_, nh, _] = &*t_layout.shape() else { unreachable!() }; let &[st, sh, sd] = t_layout.strides() else { @@ -140,15 +131,9 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - nt nh dh - st sh sd - sp - } - let unit = dt_t.nbytes() as isize; if sd != unit || sp != dt_p.nbytes() as isize { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; let dh = dh / 2; @@ -156,7 +141,7 @@ impl crate::Operator for Operator { let sh = (sh / unit / 2) as i32; if self.max_group_size % dh != 0 { - return Err(shape_not_support("").into()); + return Err(shape_not_support("")); } let max_nh_l = (self.max_group_size / dh).min(nh); @@ -180,8 +165,8 @@ impl crate::Operator for Operator { .set_arg(4, theta) .launch( &[0, 0], - &[(nt * nh_l) as usize, (nh_h * dh) as usize], - &[nh_l as usize, dh as usize], + &[nt * nh_l, nh_h * dh], + &[nh_l, dh], queue_alloc.queue(), None, ); @@ -238,26 +223,11 @@ mod test { use super::Args; use crate::{Hardware, TensorLayout}; use digit_layout::{ - types::{F32, F64, U32}, DigitLayout, + types::{F64, U32}, }; - fn dyn_args(dt_t: DigitLayout, dt_p: DigitLayout) -> Args { - use crate::dyn_; - use std::ptr::{null, null_mut}; - Args { - t_layout: TensorLayout::new_dyn(dt_t, &[dyn_(); 3], &[dyn_(); 3]), - t_base: null_mut(), - p_layout: TensorLayout::new_dyn(dt_p, &[dyn_()], &[dyn_()]), - p_base: null(), - sin_layout: TensorLayout::new_dyn(dt_t, &[dyn_(); 2], &[dyn_(); 2]), - sin_base: null(), - cos_layout: TensorLayout::new_dyn(dt_t, &[dyn_(); 2], &[dyn_(); 2]), - cos_base: null(), - theta: 0., - } - } - + #[allow(clippy::too_many_arguments)] fn args( dt_t: DigitLayout, dt_p: DigitLayout, @@ -286,26 +256,24 @@ mod test { fn test_compute() { use super::{super::common_cpu::Operator as RefOp, Operator}; use crate::{ + Operator as _, common_cpu::{Cpu, ThisThread}, opencl::ClDevice, test_utils::{Diff, ErrorCollector}, - Operator as _, }; use clrt::Platform; use digit_layout::types as ty; use rand::Rng; use std::{iter::zip, time::Instant}; - let mut cpu_op = RefOp::new(&Cpu); + let cpu_op = RefOp::new(&Cpu); for platform in Platform::all() { for device in platform.devices() { println!("device: {}", device.name()); let context = device.context(); let queue = context.queue(); - let mut cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); - cpu_op.scheme(&dyn_args(F64, U32), 0).unwrap(); - cl_op.scheme(&dyn_args(F32, U32), 0).unwrap(); + let cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); const NT: usize = 1; let nh = 32; @@ -375,7 +343,7 @@ mod test { .unwrap(); let cpu_time = time.elapsed(); - let map = queue.map(&mut t_svm); + let map = queue.map(&t_svm); let ([], y_ans, []) = (unsafe { map.align_to::() }) else { panic!() diff --git a/operators/src/swiglu/args.rs b/operators/src/swiglu/args.rs index fbc2bc7e..723b7594 100644 --- a/operators/src/swiglu/args.rs +++ b/operators/src/swiglu/args.rs @@ -1,6 +1,6 @@ -use crate::{ +use crate::{ + ConstPtr, Hardware, LaunchError, MutPtr, TensorLayout, utils::{dim_distinct, rank_error, type_distinct}, - ConstPtr, Hardware, MaybeDyn, MutPtr, SchemeError, TensorLayout, }; use digit_layout::DigitLayout; @@ -13,8 +13,8 @@ pub struct Args { pub(super) struct Meta { pub dt: DigitLayout, - pub n: MaybeDyn, - pub d: MaybeDyn, + pub n: usize, + pub d: usize, } impl Args { @@ -28,24 +28,24 @@ impl Args { } } - pub(super) fn meta(&self) -> Result { + pub(super) fn meta(&self) -> Result { let Self { gate_layout, up_layout, .. } = self; - let &[gn, gd] = gate_layout.shape() else { + let &[gn, gd] = &*gate_layout.shape() else { return Err(rank_error("gate", 2, gate_layout.ndim())); }; - let &[un, ud] = up_layout.shape() else { + let &[un, ud] = &*up_layout.shape() else { return Err(rank_error("up", 2, up_layout.ndim())); }; Ok(Meta { - dt: type_distinct(&[gate_layout.dt(), up_layout.dt()])?, - n: dim_distinct(&[gn, un])?, - d: dim_distinct(&[gd, ud])?, + dt: type_distinct(&[gate_layout.dt, up_layout.dt])?, + n: dim_distinct(&[gn, un]).expect("n mismatch"), + d: dim_distinct(&[gd, ud]).expect("d mismatch"), }) } } diff --git a/operators/src/swiglu/common_cpu/mod.rs b/operators/src/swiglu/common_cpu/mod.rs index 6b885ea6..716c8fe8 100644 --- a/operators/src/swiglu/common_cpu/mod.rs +++ b/operators/src/swiglu/common_cpu/mod.rs @@ -1,5 +1,5 @@ -use super::{args::Meta, Args, Swiglu}; -use crate::{common_cpu::Cpu, get_static, ByteOf, LaunchError, QueueAlloc, SchemeError}; +use super::{Args, Swiglu, args::Meta}; +use crate::{ByteOf, LaunchError, QueueAlloc, common_cpu::Cpu}; use half::f16; pub struct Operator; @@ -15,15 +15,6 @@ impl crate::Operator for Operator { Self } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let _meta = args.meta()?; - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -47,12 +38,6 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - sgn sgd - sun sud - } - macro_rules! calculate { ($ty:ty) => { Scheme::<$ty> { diff --git a/operators/src/swiglu/cuda/mod.rs b/operators/src/swiglu/cuda/mod.rs index 5d9e578f..133099e8 100644 --- a/operators/src/swiglu/cuda/mod.rs +++ b/operators/src/swiglu/cuda/mod.rs @@ -1,12 +1,16 @@ -use super::{args::Meta, Args, Swiglu}; +use super::{Args, Swiglu, args::Meta}; use crate::{ + ByteOf, LaunchError, QueueAlloc, cuda::{Gpu, Handle, ModuleBox}, - get_static, strides_not_support, type_not_support, + strides_not_support, type_not_support, utils::gcd, - ByteOf, LaunchError, QueueAlloc, SchemeError, }; +use cuda::params; use digit_layout::types::F16; -use std::{ffi::CString, sync::Arc}; +use std::{ + ffi::{CString, c_uint}, + sync::Arc, +}; pub struct Operator { _handle: Arc, @@ -34,15 +38,6 @@ impl crate::Operator for Operator { } } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -67,31 +62,23 @@ impl crate::Operator for Operator { }; if dt != F16 { - return Err(type_not_support("").into()); - } - - get_static! { - n d - gns gds - uns uds + return Err(type_not_support("")); } let unit = dt.nbytes() as isize; if gds != unit || uds != unit { - return Err(strides_not_support("").into()); + return Err(strides_not_support("")); }; let sg = (gns / unit) as i32; let su = (uns / unit) as i32; - let params = cuda::params![gate_base, sg, up_base, su]; + let params = params![*gate_base, sg, *up_base, su]; let block = gcd(self.max_threads_block, d); self.module.launch( CString::new(NAME).unwrap(), - (n as _, (d / block) as _), - block as u32, - params.as_ptr(), - 0, + ((n as _, (d / block) as _), block as c_uint, 0), + ¶ms.to_ptrs(), queue_alloc.queue(), ); Ok(()) @@ -117,23 +104,12 @@ extern "C" __global__ void {NAME}( #[cfg(test)] mod test { use super::{Args, Gpu, Operator}; - use crate::{dyn_, Hardware, Operator as _, TensorLayout}; + use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F16, F64}, DigitLayout, + types::{F16, F64}, }; - fn dyn_args(dt: DigitLayout) -> Args { - use std::ptr::{null, null_mut}; - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]); - Args { - gate_layout: layout.clone(), - gate_base: null_mut(), - up_layout: layout, - up_base: null(), - } - } - fn args( dt: DigitLayout, n: usize, @@ -150,27 +126,6 @@ mod test { } } - #[test] - fn test_compile() { - use super::NAME; - use std::ffi::CString; - - let Some(gpu) = Gpu::init() else { - return; - }; - println!("{}", gpu.0.device().info()); - - let mut op = Operator::new(&gpu); - op.scheme(&dyn_args(F16), 0).unwrap(); - - gpu.apply(|ctx| { - println!( - "{NAME}\n{}", - op.module.load(CString::new(NAME).unwrap(), ctx).info() - ); - }) - } - #[test] fn test_compute() { use super::super::common_cpu::Operator as RefOp; @@ -187,10 +142,8 @@ mod test { return; }; - let mut cpu_op = RefOp::new(&Cpu); - let mut gpu_op = Operator::new(&gpu); - cpu_op.scheme(&dyn_args(F64), 0).unwrap(); - gpu_op.scheme(&dyn_args(F16), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let gpu_op = Operator::new(&gpu); let n = 5632; let d = 2048; diff --git a/operators/src/swiglu/cuda/swiglu.cuh b/operators/src/swiglu/cuda/swiglu.cuh index 8ff92010..74e4004d 100644 --- a/operators/src/swiglu/cuda/swiglu.cuh +++ b/operators/src/swiglu/cuda/swiglu.cuh @@ -2,7 +2,7 @@ static __forceinline__ __device__ float sigmoid(float x) { return fdividef(1, 1 + expf(-x)); } -template +template static __device__ void swiglu( Tdata *__restrict__ gate_, int const stride_gate, diff --git a/operators/src/swiglu/infini/mod.rs b/operators/src/swiglu/infini/mod.rs index 840b6551..00e9090f 100644 --- a/operators/src/swiglu/infini/mod.rs +++ b/operators/src/swiglu/infini/mod.rs @@ -1,6 +1,6 @@ -use super::{args::Meta, Args, Swiglu}; -use crate::{get_static, infini::Device, ByteOf, LaunchError, QueueAlloc, SchemeError}; -use infini_op::{infiniop, AsRaw, Descriptor, Handle}; +use super::{Args, Swiglu, args::Meta}; +use crate::{ByteOf, LaunchError, QueueAlloc, infini::Device}; +use infini_op::{AsRaw, Descriptor, Handle, infiniop}; use std::sync::Arc; pub struct Operator(Arc); @@ -17,15 +17,6 @@ impl crate::Operator for Operator { Self(node.handle().clone()) } - #[inline] - fn scheme( - &mut self, - _args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -49,12 +40,6 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - gns gds - uns uds - } - let gate = infini_op::Tensor::new(dt, [n, d], [gns, gds]); let up = infini_op::Tensor::new(dt, [n, d], [uns, uds]); @@ -84,23 +69,12 @@ impl crate::Operator for Operator { #[cfg(test)] mod test { use super::{Args, Device, Operator}; - use crate::{dyn_, Hardware, Operator as _, TensorLayout}; + use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F16, F64}, DigitLayout, + types::{F16, F64}, }; - fn dyn_args(dt: DigitLayout) -> Args { - use std::ptr::{null, null_mut}; - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]); - Args { - gate_layout: layout.clone(), - gate_base: null_mut(), - up_layout: layout, - up_base: null(), - } - } - fn args( dt: DigitLayout, n: usize, @@ -134,10 +108,8 @@ mod test { infini_rt::init(infini_rt::DEVICE_CPU); let dev = Device::cpu(); - let mut cpu_op = RefOp::new(&Cpu); - let mut dev_op = Operator::new(&dev); - cpu_op.scheme(&dyn_args(F64), 0).unwrap(); - dev_op.scheme(&dyn_args(F16), 0).unwrap(); + let cpu_op = RefOp::new(&Cpu); + let dev_op = Operator::new(&dev); let mut gate = vec![0.0f64; n * d]; let mut up = vec![0.0f64; n * d]; diff --git a/operators/src/swiglu/opencl/mod.rs b/operators/src/swiglu/opencl/mod.rs index 4179ea02..ed4d4ef4 100644 --- a/operators/src/swiglu/opencl/mod.rs +++ b/operators/src/swiglu/opencl/mod.rs @@ -1,15 +1,13 @@ -use super::{args::Meta, Args, Swiglu}; +use super::{Args, Swiglu, args::Meta}; use crate::{ - get_static, - opencl::{ClDevice, CodeGen, KernelCache, CL2_0}, - strides_not_support, - utils::gcd, ByteOf, LaunchError, QueueAlloc, SchemeDiversity::Low as LowDiversity, - SchemeError, + opencl::{CL2_0, ClDevice, CodeGen, KernelCache}, + strides_not_support, + utils::gcd, }; -use clrt::{bindings::cl_int, Context}; -use digit_layout::{types as Ty, DigitLayout}; +use clrt::{Context, bindings::cl_int}; +use digit_layout::{DigitLayout, types as Ty}; use lru::LruCache; use std::sync::Mutex; @@ -42,18 +40,6 @@ impl crate::Operator for Operator { } } - fn scheme( - &mut self, - args: &Self::Args, - _max_workspace_size: usize, - ) -> Result { - let Meta { dt, d, .. } = args.meta()?; - if let Some(&d) = d.get_static() { - self.cache_kernel(dt, d); - } - Ok(0) - } - fn launch( &self, args: &Self::Args, @@ -77,15 +63,9 @@ impl crate::Operator for Operator { unreachable!() }; - get_static! { - n d - sgn sgd - sun sud - } - let unit = dt.nbytes() as isize; if sgd != unit || sud != unit { - return Err(strides_not_support("opencl: swiglu").into()); + return Err(strides_not_support("opencl: swiglu")); }; let sg = (sgn / unit) as i32; @@ -109,7 +89,7 @@ impl crate::Operator for Operator { .set_arg(3, (su) as cl_int) .launch( &[0, 0], - &[n as usize, d as usize], + &[n, d], &[1, group_size], queue_alloc.queue(), None, @@ -152,23 +132,12 @@ struct SchemeKey { #[cfg(test)] mod test { use super::{Args, Operator}; - use crate::{dyn_, Hardware, Operator as _, TensorLayout}; + use crate::{Hardware, Operator as _, TensorLayout}; use digit_layout::{ - types::{F32, F64}, DigitLayout, + types::{F32, F64}, }; - fn dyn_args(dt: DigitLayout) -> Args { - use std::ptr::{null, null_mut}; - let layout = TensorLayout::new_dyn(dt, &[dyn_(); 2], &[dyn_(); 2]); - Args { - gate_layout: layout.clone(), - gate_base: null_mut(), - up_layout: layout, - up_base: null(), - } - } - fn args( dt: DigitLayout, n: usize, @@ -197,16 +166,14 @@ mod test { use rand::Rng; use std::{iter::zip, time::Instant}; - let mut cpu_op = RefOp::new(&Cpu); + let cpu_op = RefOp::new(&Cpu); for platform in Platform::all() { for device in platform.devices() { println!("device: {}", device.name()); let context = device.context(); let queue = context.queue(); - let mut cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); - cpu_op.scheme(&dyn_args(F64), 0).unwrap(); - cl_op.scheme(&dyn_args(F32), 0).unwrap(); + let cl_op = Operator::new(&ClDevice::new(context.clone(), Default::default())); // let n = 5632; // let d = 2048; @@ -264,7 +231,7 @@ mod test { ) .unwrap(); let cpu_time = time.elapsed(); - let map = queue.map(&mut gate_svm); + let map = queue.map(&gate_svm); let ([], y_ans, []) = (unsafe { map.align_to::() }) else { panic!() };