From e974180f2212ce707c0e7fa2057cf1b2418b4c7e Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 13 Dec 2018 11:40:44 -0800 Subject: [PATCH] Start adding some avx512 intrinsics First one is the quite simple `_mm512_abs_epi32` intrinsic! --- .appveyor.yml | 3 + Cargo.toml | 2 + coresimd/simd.rs | 8 + coresimd/x86/avx512f.rs | 189 +++++++++++++++++++++++ coresimd/x86/mod.rs | 25 +++ coresimd/x86/test.rs | 8 + crates/coresimd/src/lib.rs | 1 + crates/stdsimd-verify/src/lib.rs | 4 + crates/stdsimd-verify/tests/x86-intel.rs | 17 ++ 9 files changed, 257 insertions(+) create mode 100644 coresimd/x86/avx512f.rs diff --git a/.appveyor.yml b/.appveyor.yml index 1ee078630d..170a59b754 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -4,6 +4,9 @@ environment: # default so pass a flag to disable it to ensure our tests work ok. RUSTFLAGS: -Clink-args=/OPT:NOICF + # VS2017 looks to be the first with avx-512 support, notably in dumpbin + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + matrix: - TARGET: x86_64-pc-windows-msvc diff --git a/Cargo.toml b/Cargo.toml index 4e96e5f494..98c4cd5e4b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,9 @@ exclude = [ [profile.release] debug = true opt-level = 3 +incremental = true [profile.bench] debug = 1 opt-level = 3 +incremental = true diff --git a/coresimd/simd.rs b/coresimd/simd.rs index dacda2f814..144050f01e 100644 --- a/coresimd/simd.rs +++ b/coresimd/simd.rs @@ -181,3 +181,11 @@ simd_ty!(i32x8[i32]: i32, i32, i32, i32, i32, i32, i32, i32 | x0, x1, x2, x3, x4, x5, x6, x7); simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); + +// 512-bit wide types: + +simd_ty!(i32x16[i32]: + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); diff --git a/coresimd/x86/avx512f.rs b/coresimd/x86/avx512f.rs new file mode 100644 index 0000000000..b8aa470a19 --- /dev/null +++ b/coresimd/x86/avx512f.rs @@ -0,0 +1,189 @@ +use coresimd::simd::*; +use coresimd::x86::*; +use mem; + +#[cfg(test)] +use stdsimd_test::assert_instr; + +/// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33&text=_mm512_abs_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i { + mem::transmute(pabsd(a.as_i32x16(), _mm512_setzero_si512().as_i32x16(), -1)) +} + +/// Compute the absolute value of packed 32-bit integers in `a`, and store the +/// unsigned results in `dst` using writemask `k` (elements are copied from +/// `src` when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33&text=_mm512_abs_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub unsafe fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + mem::transmute(pabsd(a.as_i32x16(), src.as_i32x16(), k)) +} + +/// Compute the absolute value of packed 32-bit integers in `a`, and store the +/// unsigned results in `dst` using zeromask `k` (elements are zeroed out when +/// the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33,34,35,35&text=_mm512_maskz_abs_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpabsd))] +pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { + mem::transmute(pabsd(a.as_i32x16(), _mm512_setzero_si512().as_i32x16(), k)) +} + +/// Return vector of type `__m512i` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_si512() -> __m512i { + mem::zeroed() +} + +/// Set packed 32-bit integers in `dst` with the supplied values in reverse +/// order. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_setr_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + let r = i32x16( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ); + mem::transmute(r) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.mask.pabs.d.512"] + fn pabsd(a: i32x16, b: i32x16, c: i16) -> i32x16; +} + +#[cfg(test)] +mod tests { + use std; + use stdsimd_test::simd_test; + + use coresimd::x86::*; + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, std::i32::MAX, + std::i32::MIN, 100, -100, -32, + 0, 1, -1, std::i32::MAX, + std::i32::MIN, 100, -100, -32, + ); + let r = _mm512_abs_epi32(a); + let e = _mm512_setr_epi32( + 0, + 1, + 1, + std::i32::MAX, + std::i32::MAX.wrapping_add(1), + 100, + 100, + 32, + 0, + 1, + 1, + std::i32::MAX, + std::i32::MAX.wrapping_add(1), + 100, + 100, + 32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, std::i32::MAX, + std::i32::MIN, 100, -100, -32, + 0, 1, -1, std::i32::MAX, + std::i32::MIN, 100, -100, -32, + ); + let r = _mm512_mask_abs_epi32(a, 0, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_abs_epi32(a, 0b11111111, a); + let e = _mm512_setr_epi32( + 0, + 1, + 1, + std::i32::MAX, + std::i32::MAX.wrapping_add(1), + 100, + 100, + 32, + 0, + 1, + -1, + std::i32::MAX, + std::i32::MIN, + 100, + -100, + -32, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_abs_epi32() { + #[rustfmt::skip] + let a = _mm512_setr_epi32( + 0, 1, -1, std::i32::MAX, + std::i32::MIN, 100, -100, -32, + 0, 1, -1, std::i32::MAX, + std::i32::MIN, 100, -100, -32, + ); + let r = _mm512_maskz_abs_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_abs_epi32(0b11111111, a); + let e = _mm512_setr_epi32( + 0, + 1, + 1, + std::i32::MAX, + std::i32::MAX.wrapping_add(1), + 100, + 100, + 32, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ); + assert_eq_m512i(r, e); + } +} diff --git a/coresimd/x86/mod.rs b/coresimd/x86/mod.rs index 6b32ad8641..63cdecf4c0 100644 --- a/coresimd/x86/mod.rs +++ b/coresimd/x86/mod.rs @@ -391,6 +391,10 @@ types! { pub struct __m512d(f64, f64, f64, f64, f64, f64, f64, f64); } +/// The `__mmask16` type used in AVX-512 intrinsics, a 16-bit integer +#[allow(non_camel_case_types)] +pub type __mmask16 = i16; + #[cfg(test)] mod test; #[cfg(test)] @@ -502,6 +506,24 @@ impl m256iExt for __m256i { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdimd_internal", issue = "0")] +pub(crate) trait m512iExt: Sized { + fn as_m512i(self) -> __m512i; + + #[inline] + fn as_i32x16(self) -> ::coresimd::simd::i32x16 { + unsafe { mem::transmute(self.as_m512i()) } + } +} + +impl m512iExt for __m512i { + #[inline] + fn as_m512i(self) -> Self { + self + } +} + mod eflags; pub use self::eflags::*; @@ -580,3 +602,6 @@ use stdsimd_test::assert_instr; pub unsafe fn ud2() -> ! { ::intrinsics::abort() } + +mod avx512f; +pub use self::avx512f::*; diff --git a/coresimd/x86/test.rs b/coresimd/x86/test.rs index bb9ed7142e..40a49402f8 100644 --- a/coresimd/x86/test.rs +++ b/coresimd/x86/test.rs @@ -135,3 +135,11 @@ mod x86_polyfill { pub use coresimd::x86_64::{_mm256_insert_epi64, _mm_insert_epi64}; } pub use self::x86_polyfill::*; + +pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { + union A { + a: __m512i, + b: [i32; 16], + } + assert_eq!(A { a }.b, A { a: b }.b) +} diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs index 407986c43f..416218dada 100644 --- a/crates/coresimd/src/lib.rs +++ b/crates/coresimd/src/lib.rs @@ -33,6 +33,7 @@ sse4a_target_feature, arm_target_feature, aarch64_target_feature, + avx512_target_feature, mips_target_feature, powerpc_target_feature )] diff --git a/crates/stdsimd-verify/src/lib.rs b/crates/stdsimd-verify/src/lib.rs index db213c80d3..7e76bbdc63 100644 --- a/crates/stdsimd-verify/src/lib.rs +++ b/crates/stdsimd-verify/src/lib.rs @@ -98,6 +98,10 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "__m256" => quote! { &M256 }, "__m256d" => quote! { &M256D }, "__m256i" => quote! { &M256I }, + "__m512" => quote! { &M512 }, + "__m512d" => quote! { &M512D }, + "__m512i" => quote! { &M512I }, + "__mmask16" => quote! { &MMASK16 }, "__m64" => quote! { &M64 }, "bool" => quote! { &BOOL }, "f32" => quote! { &F32 }, diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index b1d12d10aa..2363927311 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -54,6 +54,10 @@ static M128D: Type = Type::M128D; static M256: Type = Type::M256; static M256I: Type = Type::M256I; static M256D: Type = Type::M256D; +static M512: Type = Type::M512; +static M512I: Type = Type::M512I; +static M512D: Type = Type::M512D; +static MMASK16: Type = Type::MMASK16; static TUPLE: Type = Type::Tuple; static CPUID: Type = Type::CpuidResult; @@ -72,6 +76,10 @@ enum Type { M256, M256D, M256I, + M512, + M512D, + M512I, + MMASK16, Tuple, CpuidResult, Never, @@ -422,6 +430,15 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), | (&Type::M256, "__m256") | (&Type::Ptr(&Type::M256), "__m256*") => {} + (&Type::M512I, "__m512i") + | (&Type::Ptr(&Type::M512I), "__m512i*") + | (&Type::M512D, "__m512d") + | (&Type::Ptr(&Type::M512D), "__m512d*") + | (&Type::M512, "__m512") + | (&Type::Ptr(&Type::M512), "__m512*") => {} + + (&Type::MMASK16, "__mmask16") => {} + // This is a macro (?) in C which seems to mutate its arguments, but // that means that we're taking pointers to arguments in rust // as we're not exposing it as a macro.