diff --git a/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile index 5af97f9adf..40dbebdcc9 100644 --- a/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile +++ b/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile @@ -8,6 +8,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ bzip2 -RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2 -RUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2 -ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.16.0-2018-01-30-lin/sde64 --" +RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.35.0-2019-03-11-lin.tar.bz2 +RUN tar -xjf sde-external-8.35.0-2019-03-11-lin.tar.bz2 +ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.35.0-2019-03-11-lin/sde64 -rtm_mode full --" diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index dcec914bcc..58554b7ba6 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -32,6 +32,7 @@ abi_unadjusted, adx_target_feature, rtm_target_feature, + f16c_target_feature, external_doc )] #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))] @@ -75,7 +76,4 @@ mod core_arch; pub use self::core_arch::arch::*; #[allow(unused_imports)] -use core::{ffi, intrinsics, marker, mem, ptr, sync}; - -#[cfg(test)] -use core::hint; +use core::{ffi, hint, intrinsics, marker, mem, ptr, sync}; diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 79c61b5170..568fd1d16d 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -184,6 +184,10 @@ simd_ty!(i32x8[i32]: | x0, x1, x2, x3, x4, x5, x6, x7); simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); +simd_ty!(f32x8[f32]: + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7); + // 512-bit wide types: simd_ty!(i32x16[i32]: diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs new file mode 100644 index 0000000000..195485914b --- /dev/null +++ b/crates/core_arch/src/x86/f16c.rs @@ -0,0 +1,134 @@ +//! [F16C intrinsics]. +//! +//! [F16C intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769 + +use crate::{ + core_arch::{simd::*, x86::*}, + hint::unreachable_unchecked, + mem::transmute, +}; + +#[cfg(test)] +use stdsimd_test::assert_instr; + +#[allow(improper_ctypes)] +extern "unadjusted" { + #[link_name = "llvm.x86.vcvtph2ps.128"] + fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; + #[link_name = "llvm.x86.vcvtph2ps.256"] + fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; + #[link_name = "llvm.x86.vcvtps2ph.128"] + fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; + #[link_name = "llvm.x86.vcvtps2ph.256"] + fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; +} + +/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of +/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide +/// vector. +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 { + transmute(llvm_vcvtph2ps_128(transmute(a))) +} + +/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector +/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. +#[inline] +#[target_feature(enable = "f16c")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 { + transmute(llvm_vcvtph2ps_256(transmute(a))) +} + +macro_rules! dispatch_rounding { + ($rounding:ident, $call:ident) => {{ + match $rounding { + 0 => call!(0), + 1 => call!(1), + 2 => call!(2), + 3 => call!(3), + 4 => call!(4), + 5 => call!(5), + 6 => call!(6), + 7 => call!(7), + _ => unreachable_unchecked(), + } + }}; +} + +/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x +/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit +/// vector. +/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, +/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, +/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, +/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, +/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. +#[inline] +#[target_feature(enable = "f16c")] +#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] +pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i { + let a = transmute(a); + macro_rules! call { + ($rounding:expr) => { + llvm_vcvtps2ph_128(a, $rounding) + }; + } + transmute(dispatch_rounding!(imm_rounding, call)) +} + +/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x +/// 16-bit half-precision float values stored in a 128-bit wide vector. +/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, +/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, +/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, +/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, +/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. +#[inline] +#[target_feature(enable = "f16c")] +#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] +pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i { + let a = transmute(a); + macro_rules! call { + ($rounding:expr) => { + llvm_vcvtps2ph_256(a, $rounding) + }; + } + transmute(dispatch_rounding!(imm_rounding, call)) +} + +#[cfg(test)] +mod tests { + use crate::{core_arch::x86::*, mem::transmute}; + use stdsimd_test::simd_test; + + #[simd_test(enable = "f16c")] + unsafe fn test_mm_cvtph_ps() { + let array = [1_f32, 2_f32, 3_f32, 4_f32]; + let float_vec: __m128 = transmute(array); + let halfs: __m128i = _mm_cvtps_ph(float_vec, 0); + let floats: __m128 = _mm_cvtph_ps(halfs); + let result: [f32; 4] = transmute(floats); + assert_eq!(result, array); + } + + #[simd_test(enable = "f16c")] + unsafe fn test_mm256_cvtph_ps() { + let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32]; + let float_vec: __m256 = transmute(array); + let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0); + let floats: __m256 = _mm256_cvtph_ps(halfs); + let result: [f32; 8] = transmute(floats); + assert_eq!(result, array); + } +} diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 026ec1eec7..bed9e4a020 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -568,3 +568,6 @@ pub use self::bt::*; mod rtm; pub use self::rtm::*; + +mod f16c; +pub use self::f16c::*; diff --git a/crates/core_arch/src/x86/rtm.rs b/crates/core_arch/src/x86/rtm.rs index fa559faf37..ebe3ed80da 100644 --- a/crates/core_arch/src/x86/rtm.rs +++ b/crates/core_arch/src/x86/rtm.rs @@ -32,6 +32,7 @@ pub const _XBEGIN_STARTED: u32 = !0; /// Transaction explicitly aborted with xabort. The parameter passed to xabort is available with /// `_xabort_code(status)`. +#[allow(clippy::identity_op)] pub const _XABORT_EXPLICIT: u32 = 1 << 0; /// Transaction retry is possible. diff --git a/crates/core_arch/tests/cpu-detection.rs b/crates/core_arch/tests/cpu-detection.rs index 9a7c999a18..321f24e9fc 100644 --- a/crates/core_arch/tests/cpu-detection.rs +++ b/crates/core_arch/tests/cpu-detection.rs @@ -31,6 +31,7 @@ fn x86_all() { "avx512_vpopcntdq {:?}", is_x86_feature_detected!("avx512vpopcntdq") ); + println!("f16c: {:?}", is_x86_feature_detected!("f16c")); println!("fma: {:?}", is_x86_feature_detected!("fma")); println!("abm: {:?}", is_x86_feature_detected!("abm")); println!("bmi: {:?}", is_x86_feature_detected!("bmi1")); diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index 45f2d5bfc8..50d5cfa87c 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -62,6 +62,7 @@ /// * `"avx512ifma"` /// * `"avx512vbmi"` /// * `"avx512vpopcntdq"` +/// * `"f16c"` /// * `"fma"` /// * `"bmi1"` /// * `"bmi2"` @@ -179,6 +180,10 @@ macro_rules! is_x86_feature_detected { cfg!(target_feature = "avx512vpopcntdq") || $crate::detect::check_for( $crate::detect::Feature::avx512_vpopcntdq) }; + ("f16c") => { + cfg!(target_feature = "f16c") || $crate::detect::check_for( + $crate::detect::Feature::f16c) + }; ("fma") => { cfg!(target_feature = "fma") || $crate::detect::check_for( $crate::detect::Feature::fma) @@ -309,6 +314,8 @@ pub enum Feature { /// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and /// Quadword) avx512_vpopcntdq, + /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats) + f16c, /// FMA (Fused Multiply Add) fma, /// BMI1 (Bit Manipulation Instructions 1) diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs index ab0622106c..e543d301c7 100644 --- a/crates/std_detect/src/detect/os/x86.rs +++ b/crates/std_detect/src/detect/os/x86.rs @@ -113,13 +113,14 @@ fn detect_features() -> cache::Initializer { }; enable(proc_info_ecx, 0, Feature::sse3); + enable(proc_info_ecx, 1, Feature::pclmulqdq); enable(proc_info_ecx, 9, Feature::ssse3); enable(proc_info_ecx, 13, Feature::cmpxchg16b); enable(proc_info_ecx, 19, Feature::sse4_1); enable(proc_info_ecx, 20, Feature::sse4_2); enable(proc_info_ecx, 23, Feature::popcnt); enable(proc_info_ecx, 25, Feature::aes); - enable(proc_info_ecx, 1, Feature::pclmulqdq); + enable(proc_info_ecx, 29, Feature::f16c); enable(proc_info_ecx, 30, Feature::rdrand); enable(extended_features_ebx, 18, Feature::rdseed); enable(extended_features_ebx, 19, Feature::adx); diff --git a/crates/std_detect/tests/cpu-detection.rs b/crates/std_detect/tests/cpu-detection.rs index b2b8abb010..0aae39e294 100644 --- a/crates/std_detect/tests/cpu-detection.rs +++ b/crates/std_detect/tests/cpu-detection.rs @@ -87,6 +87,7 @@ fn x86_all() { "avx512_vpopcntdq {:?}", is_x86_feature_detected!("avx512vpopcntdq") ); + println!("f16c: {:?}", is_x86_feature_detected!("f16c")); println!("fma: {:?}", is_x86_feature_detected!("fma")); println!("bmi1: {:?}", is_x86_feature_detected!("bmi1")); println!("bmi2: {:?}", is_x86_feature_detected!("bmi2")); diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index 3de07f30f4..fe6d801440 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -293,11 +293,15 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { .flat_map(|c| c.to_lowercase()) .collect::(); - // The XML file names IFMA as "avx512ifma52", while Rust calls - // it "avx512ifma". Fix this mismatch by replacing the Intel - // name with the Rust name. + // Fix mismatching feature names: let fixup_cpuid = |cpuid: String| match cpuid.as_ref() { + // The XML file names IFMA as "avx512ifma52", while Rust calls + // it "avx512ifma". "avx512ifma52" => String::from("avx512ifma"), + // See: https://github.com/rust-lang-nursery/stdsimd/issues/738 + // The intrinsics guide calls `f16c` `fp16c` in disagreement with + // Intel's architecture manuals. + "fp16c" => String::from("f16c"), _ => cpuid, }; let fixed_cpuid = fixup_cpuid(cpuid);