From 73b21f50ae1a872d53a360b850b56c54532a98e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Mon, 6 Nov 2017 14:35:32 +0000 Subject: [PATCH 01/12] Add _mm_cvtepi32_epi64 and fix typo --- src/x86/sse41.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index aabb8fdb79..6f5dbe0780 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -330,7 +330,7 @@ pub unsafe fn _mm_cvtepi16_epi32(a: i16x8) -> i32x4 { simd_shuffle4::<_, ::v64::i16x4>(a, a, [0, 1, 2, 3]).as_i32x4() } -/// Sign extend packed 16-bit integers in a to packed 64-bit integers +/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovsxwq))] @@ -338,6 +338,14 @@ pub unsafe fn _mm_cvtepi16_epi64(a: i16x8) -> i64x2 { simd_shuffle2::<_, ::v32::i16x2>(a, a, [0, 1]).as_i64x2() } +/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmovsxdq))] +pub unsafe fn _mm_cvtepi32_epi64(a: i32x4) -> i64x2 { + simd_shuffle2::<_, ::v64::i32x2>(a, a, [0, 1]).as_i64x2() +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -1020,6 +1028,18 @@ mod tests { let e = i64x2::splat(-10); assert_eq!(r, e); } + + #[simd_test = "sse4.1"] + unsafe fn _mm_cvtepi32_epi64() { + let a = i32x4::splat(10); + let r = sse41::_mm_cvtepi32_epi64(a); + let e = i64x2::splat(10); + assert_eq!(r, e); + let a = i32x4::splat(-10); + let r = sse41::_mm_cvtepi32_epi64(a); + let e = i64x2::splat(-10); + assert_eq!(r, e); + } #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { From f8cc9f609f27ee01c641b4d238e41fe0eb1f6780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Mon, 6 Nov 2017 15:09:14 +0000 Subject: [PATCH 02/12] Add _mm_cvtepu8_epi{16, 32, 64} --- src/lib.rs | 14 ++++++++++++-- src/v64.rs | 1 + src/x86/sse41.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6e41a7026f..2ea9d96867 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -176,7 +176,11 @@ mod v32 { define_ty! { u8x4, u8, u8, u8, u8 } define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 } - define_casts!((i8x4, i32x4, as_i32x4), (i16x2, i64x2, as_i64x2)); + define_casts!( + (i8x4, i32x4, as_i32x4), + (u8x4, i32x4, as_i32x4), + (i16x2, i64x2, as_i64x2) + ); } /// 16-bit wide vector tpyes @@ -186,7 +190,13 @@ mod v16 { define_ty! { i8x2, i8, i8 } define_impl! { i8x2, i8, 2, i8x2, x0, x1 } - define_casts!((i8x2, i64x2, as_i64x2)); + define_ty! { u8x2, u8, u8 } + define_impl! 
{ u8x2, u8, 2, i8x2, x0, x1 } + + define_casts!( + (i8x2, i64x2, as_i64x2), + (u8x2, i64x2, as_i64x2) + ); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] diff --git a/src/v64.rs b/src/v64.rs index 0df2e878d6..9b4670bae9 100644 --- a/src/v64.rs +++ b/src/v64.rs @@ -60,6 +60,7 @@ define_casts!( (u8x8, i8x8, as_i8x8), (i8x8, u8x8, as_u8x8), (i8x8, i16x8, as_i16x8), + (u8x8, i16x8, as_i16x8), (i16x4, i32x4, as_i32x4), (i32x2, i64x2, as_i64x2), (u8x8, u16x8, as_u16x8), diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 6f5dbe0780..23d2581999 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -346,6 +346,30 @@ pub unsafe fn _mm_cvtepi32_epi64(a: i32x4) -> i64x2 { simd_shuffle2::<_, ::v64::i32x2>(a, a, [0, 1]).as_i64x2() } +/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmovzxbw))] +pub unsafe fn _mm_cvtepu8_epi16(a: u8x16) -> i16x8 { + simd_shuffle8::<_, ::v64::u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]).as_i16x8() +} + +/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmovzxbd))] +pub unsafe fn _mm_cvtepu8_epi32(a: u8x16) -> i32x4 { + simd_shuffle4::<_, ::v32::u8x4>(a, a, [0, 1, 2, 3]).as_i32x4() +} + +/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmovzxbq))] +pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 { + simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]).as_i64x2() +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -1041,6 +1065,32 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_cvtepu8_epi16() { + let a = u8x16::splat(10); + let r = sse41::_mm_cvtepu8_epi16(a); + let e = i16x8::splat(10); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_cvtepu8_epi32() { + let a = u8x16::splat(10); + let r = sse41::_mm_cvtepu8_epi32(a); + let e = i32x4::splat(10); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_cvtepu8_epi64() { + let a = u8x16::splat(10); + let r = sse41::_mm_cvtepu8_epi64(a); + let e = i64x2::splat(10); + assert_eq!(r, e); + } + + + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 33a3819e848558d0a105a49b53f988256618ffcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Mon, 6 Nov 2017 15:45:33 +0000 Subject: [PATCH 03/12] Add remaining _mm_cvtep* intrinsics --- src/lib.rs | 9 +++++---- src/v64.rs | 4 +++- src/x86/sse41.rs | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2ea9d96867..509935e42e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -169,17 +169,19 @@ mod v32 { define_ty! { i16x2, i16, i16 } define_impl! { i16x2, i16, 2, i16x2, x0, x1 } + define_ty! { u16x2, u16, u16 } + define_impl! { u16x2, u16, 2, i16x2, x0, x1 } define_ty! { i8x4, i8, i8, i8, i8 } define_impl! { i8x4, i8, 4, i8x4, x0, x1, x2, x3 } - define_ty! { u8x4, u8, u8, u8, u8 } define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 } define_casts!( + (i16x2, i64x2, as_i64x2), + (u16x2, i64x2, as_i64x2), (i8x4, i32x4, as_i32x4), - (u8x4, i32x4, as_i32x4), - (i16x2, i64x2, as_i64x2) + (u8x4, i32x4, as_i32x4) ); } @@ -189,7 +191,6 @@ mod v16 { define_ty! { i8x2, i8, i8 } define_impl! { i8x2, i8, 2, i8x2, x0, x1 } - define_ty! { u8x2, u8, u8 } define_impl! 
{ u8x2, u8, 2, i8x2, x0, x1 } diff --git a/src/v64.rs b/src/v64.rs index 9b4670bae9..c1e346d1b2 100644 --- a/src/v64.rs +++ b/src/v64.rs @@ -65,7 +65,9 @@ define_casts!( (i32x2, i64x2, as_i64x2), (u8x8, u16x8, as_u16x8), (u16x4, u32x4, as_u32x4), - (u32x2, u64x2, as_u64x2) + (u16x4, i32x4, as_i32x4), + (u32x2, u64x2, as_u64x2), + (u32x2, i64x2, as_i64x2) ); #[cfg(test)] diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 23d2581999..06493a7526 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -370,6 +370,30 @@ pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 { simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]).as_i64x2() } +/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmovzxwd))] +pub unsafe fn _mm_cvtepu16_epi32(a: u16x8) -> i32x4 { + simd_shuffle4::<_, ::v64::u16x4>(a, a, [0, 1, 2, 3]).as_i32x4() +} + +/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmovzxwq))] +pub unsafe fn _mm_cvtepu16_epi64(a: u16x8) -> i64x2 { + simd_shuffle2::<_, ::v32::u16x2>(a, a, [0, 1]).as_i64x2() +} + +/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmovzxdq))] +pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 { + simd_shuffle2::<_, ::v64::u32x2>(a, a, [0, 1]).as_i64x2() +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -1081,7 +1105,7 @@ mod tests { assert_eq!(r, e); } - #[simd_test = "sse4.1"] + #[simd_test = "sse4.1"] unsafe fn _mm_cvtepu8_epi64() { let a = u8x16::splat(10); let r = sse41::_mm_cvtepu8_epi64(a); @@ -1089,7 +1113,29 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_cvtepu16_epi32() { + let a = u16x8::splat(10); + let r = sse41::_mm_cvtepu16_epi32(a); + let e = i32x4::splat(10); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] + unsafe fn _mm_cvtepu16_epi64() { + let a = u16x8::splat(10); + let r = sse41::_mm_cvtepu16_epi64(a); + let e = i64x2::splat(10); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_cvtepu32_epi64() { + let a = u32x4::splat(10); + let r = sse41::_mm_cvtepu32_epi64(a); + let e = i64x2::splat(10); + assert_eq!(r, e); + } #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { From f630b1f58826aa0374e3e4719f59ec471ac74d12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Mon, 6 Nov 2017 17:23:30 +0000 Subject: [PATCH 04/12] Add _mm_mul_epi32 and _mm_mullo_epi32 --- src/x86/sse41.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 06493a7526..6e3e84eccc 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -394,6 +394,24 @@ pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 { simd_shuffle2::<_, ::v64::u32x2>(a, a, [0, 1]).as_i64x2() } +/// Multiply the low 32-bit integers from each packed 64-bit element in `a` and `b` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmuldq))] +pub unsafe fn _mm_mul_epi32(a: i32x4, b:i32x4) -> i64x2 { + pmuldq(a, b) +} + + +/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, +/// and return the low 32 bits of the intermediate integers. 
+#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmulld))] +pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 { + a * b +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. @@ -704,6 +722,8 @@ extern "C" { fn pminud(a: u32x4, b: u32x4) -> u32x4; #[link_name = "llvm.x86.sse41.packusdw"] fn packusdw(a: i32x4, b: i32x4) -> u16x8; + #[link_name = "llvm.x86.sse41.pmuldq"] + fn pmuldq(a: i32x4, b: i32x4) -> i64x2; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.sse41.dpps"] @@ -1137,6 +1157,24 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_mul_epi32() { + let a = i32x4::new(1, 1, 1, 1); + let b = i32x4::new(1, 2, 3, 4); + let r = sse41::_mm_mul_epi32(a, b); + let e = i64x2::new(1, 3); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_mullo_epi32() { + let a = i32x4::new(1, 1, 1, 1); + let b = i32x4::new(1, 2, 3, 4); + let r = sse41::_mm_mullo_epi32(a, b); + let e = i32x4::new(1, 2, 3, 4); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From cdf0c240e60db67c4131f3bbbc1768948410ae53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Mon, 6 Nov 2017 17:24:55 +0000 Subject: [PATCH 05/12] Add _mm_testz_si128, _mm_testc_si128 and _mm_testnzc_si128 This should work for any 128 bit sized vector, but it only accepts i64x2 for now --- src/x86/sse41.rs | 80 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 6e3e84eccc..6f34435c7b 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -402,7 +402,6 @@ pub unsafe fn _mm_mul_epi32(a: i32x4, b:i32x4) -> i64x2 { pmuldq(a, b) } - /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, /// and 
return the low 32 bits of the intermediate integers. #[inline(always)] @@ -412,6 +411,27 @@ pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 { a * b } +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(ptest))] +pub unsafe fn _mm_testz_si128(a: i64x2, mask: i64x2) -> i32 { + ptestz(a, mask) +} + +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(ptest))] +pub unsafe fn _mm_testc_si128(a: i64x2, mask: i64x2) -> i32 { + ptestc(a, mask) +} + +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(ptest))] +pub unsafe fn _mm_testnzc_si128(a: i64x2, mask: i64x2) -> i32 { + ptestnzc(a, mask) +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. @@ -724,6 +744,12 @@ extern "C" { fn packusdw(a: i32x4, b: i32x4) -> u16x8; #[link_name = "llvm.x86.sse41.pmuldq"] fn pmuldq(a: i32x4, b: i32x4) -> i64x2; + #[link_name = "llvm.x86.sse41.ptestz"] + fn ptestz(a: i64x2, mask: i64x2) -> i32; + #[link_name = "llvm.x86.sse41.ptestc"] + fn ptestc(a: i64x2, mask: i64x2) -> i32; + #[link_name = "llvm.x86.sse41.ptestnzc"] + fn ptestnzc(a: i64x2, mask: i64x2) -> i32; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.sse41.dpps"] @@ -1175,6 +1201,58 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_testz_si128() { + let a = i64x2::splat(1); + let mask = i64x2::splat(0); + let r = sse41::_mm_testz_si128(a, mask); + assert_eq!(r, 1); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b110); + let r = sse41::_mm_testz_si128(a, mask); + assert_eq!(r, 0); + let a = i64x2::splat(0b011); + let mask = i64x2::splat(0b100); + let r = sse41::_mm_testz_si128(a, mask); + assert_eq!(r, 1); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_testc_si128() { + let a = i64x2::splat(-1); + let mask = i64x2::splat(0); + let r = 
sse41::_mm_testc_si128(a, mask); + assert_eq!(r, 1); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b110); + let r = sse41::_mm_testc_si128(a, mask); + assert_eq!(r, 0); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b100); + let r = sse41::_mm_testc_si128(a, mask); + assert_eq!(r, 1); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_testnzc_si128() { + let a = i64x2::splat(0); + let mask = i64x2::splat(1); + let r = sse41::_mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + let a = i64x2::splat(-1); + let mask = i64x2::splat(0); + let r = sse41::_mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b110); + let r = sse41::_mm_testnzc_si128(a, mask); + assert_eq!(r, 1); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b101); + let r = sse41::_mm_testnzc_si128(a, mask); + assert_eq!(r, 0); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From c8a5739abbbc3d2bda626bd325424a37c4a891bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 7 Nov 2017 09:32:09 +0000 Subject: [PATCH 06/12] Add documentation for testz, testc and testnzc --- src/x86/sse41.rs | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 6f34435c7b..30450479d0 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -411,6 +411,18 @@ pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 { a * b } +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all zeros, +/// * `0` - otherwise. 
#[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] @@ -418,6 +430,19 @@ pub unsafe fn _mm_testz_si128(a: i64x2, mask: i64x2) -> i32 { ptestz(a, mask) } + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all ones, +/// * `0` - otherwise. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] @@ -425,6 +450,18 @@ pub unsafe fn _mm_testc_si128(a: i64x2, mask: i64x2) -> i32 { ptestc(a, mask) } +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are neither all zeros nor all ones, +/// * `0` - otherwise. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] From 19b2aede121a620bc63a0bd600fd668ecdea0b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 7 Nov 2017 09:54:53 +0000 Subject: [PATCH 07/12] Add _mm_test_all_zeros, _mm_test_all_ones and _mm_test_mix_ones_zeros --- src/x86/sse41.rs | 103 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 30450479d0..abc037949a 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -469,6 +469,63 @@ pub unsafe fn _mm_testnzc_si128(a: i64x2, mask: i64x2) -> i32 { ptestnzc(a, mask) } +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. 
+/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are all zeros, +/// * `0` - otherwise. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(ptest))] +pub unsafe fn _mm_test_all_zeros(a: i64x2, mask: i64x2) -> i32 { + _mm_testz_si128(a, mask) +} + +/// Tests whether the specified bits in `a` 128-bit integer vector are all +/// ones. +/// +/// Argument: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// +/// Returns: +/// +/// * `1` - if the bits specified in the operand are all set to 1, +/// * `0` - otherwise. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pcmpeqd))] +#[cfg_attr(test, assert_instr(ptest))] +pub unsafe fn _mm_test_all_ones(a: i64x2) -> i32 { + _mm_testc_si128(a, i64x2::splat(-1)) +} + +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// Arguments: +/// +/// * `a` - A 128-bit integer vector containing the bits to be tested. +/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// +/// Returns: +/// +/// * `1` - if the specified bits are neither all zeros nor all ones, +/// * `0` - otherwise. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(ptest))] +pub unsafe fn _mm_test_mix_ones_zeros(a: i64x2, mask: i64x2) -> i32 { + _mm_testnzc_si128(a, mask) +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -1290,6 +1347,52 @@ mod tests { assert_eq!(r, 0); } + #[simd_test = "sse4.1"] + unsafe fn _mm_test_all_zeros() { + let a = i64x2::splat(1); + let mask = i64x2::splat(0); + let r = sse41::_mm_test_all_zeros(a, mask); + assert_eq!(r, 1); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b110); + let r = sse41::_mm_test_all_zeros(a, mask); + assert_eq!(r, 0); + let a = i64x2::splat(0b011); + let mask = i64x2::splat(0b100); + let r = sse41::_mm_test_all_zeros(a, mask); + assert_eq!(r, 1); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_test_all_ones() { + let a = i64x2::splat(-1); + let r = sse41::_mm_test_all_ones(a); + assert_eq!(r, 1); + let a = i64x2::splat(0b101); + let r = sse41::_mm_test_all_ones(a); + assert_eq!(r, 0); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_test_mix_ones_zeros() { + let a = i64x2::splat(0); + let mask = i64x2::splat(1); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + let a = i64x2::splat(-1); + let mask = i64x2::splat(0); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b110); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 1); + let a = i64x2::splat(0b101); + let mask = i64x2::splat(0b101); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); + assert_eq!(r, 0); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From f7f2a2348f729bec52b70c1b52320eb73800ed9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 7 Nov 2017 10:12:55 +0000 Subject: [PATCH 08/12] Add _mm_minpos_epu16 --- src/x86/sse41.rs | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index abc037949a..12f4016f6e 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -768,10 +768,26 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 { constify_imm4!(rounding, call) } -/// Find 
minimal u16 element in vector. -/// Place it in the first element of resulting vector and it's index -/// in second element (formally bits [16..18] inclusive). -/// All other elements are set to zero. +/// Finds the minimum u16 in the u16x8 vector, returning it in the first +/// position of the result vector along with its index in the second position; +/// all other elements are set to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPHMINPOSUW / PHMINPOSUW +/// instruction. +/// +/// Arguments: +/// +/// * `a` - A 128-bit vector of type `u16x8`. +/// +/// Returns: +/// +/// A 128-bit value where: +/// +/// * bits `[15:0]` - contain the minimum value found in parameter `a`, +/// * bits `[18:16]` - contain the index of the minimum value +/// * remaining bits are set to `0`. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(phminposuw))] @@ -801,7 +817,6 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { a * b } - #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.sse41.pblendvb"] @@ -1216,7 +1231,7 @@ mod tests { let e = i64x2::splat(-10); assert_eq!(r, e); } - + #[simd_test = "sse4.1"] unsafe fn _mm_cvtepi32_epi64() { let a = i32x4::splat(10); @@ -1558,6 +1573,12 @@ mod tests { // Attention, most significant bit in r[2] is treated as a sign bit! 
// 1234567 * 666666 = -1589877210 let e = i32x4::new(-300, 512, -1589877210, -1409865409); + + #[simd_test = "sse4.1"] + unsafe fn _mm_minpos_epu16() { + let a = u16x8::new(8, 7, 6, 5, 4, 1, 2, 3); + let r = sse41::_mm_minpos_epu16(a); + let e = u16x8::splat(0).replace(0, 1).replace(1, 5); assert_eq!(r, e); } } From 6b718d9702d2d3c8304d93ff32b81a3ba7aeeca3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 7 Nov 2017 12:14:43 +0000 Subject: [PATCH 09/12] Add _mm_mpsadbw_epu8 --- src/x86/macros.rs | 16 ++++++ src/x86/sse41.rs | 121 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 109 insertions(+), 28 deletions(-) diff --git a/src/x86/macros.rs b/src/x86/macros.rs index f268a3499b..79109fd67c 100644 --- a/src/x86/macros.rs +++ b/src/x86/macros.rs @@ -328,6 +328,22 @@ macro_rules! constify_imm4 { } } +macro_rules! constify_imm3 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match $imm8 & 0b111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + _ => $expand!(7), + } + } +} + macro_rules! constify_imm2 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 12f4016f6e..88302a2964 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -151,8 +151,7 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { /// Then zero elements according to `imm8`. /// /// `imm8` specifies which bits from operand `a` will be copied, which bits in -/// the -/// result they will be copied to, and which bits in the result will be +/// the result they will be copied to, and which bits in the result will be /// cleared. 
The following assignments are made: /// /// * Bits `[7:6]` specify the bits to copy from operand `a`: @@ -413,14 +412,14 @@ pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 { /// Tests whether the specified bits in a 128-bit integer vector are all /// zeros. -/// +/// /// Arguments: -/// +/// /// * `a` - A 128-bit integer vector containing the bits to be tested. /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. -/// +/// /// Returns: -/// +/// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. #[inline(always)] @@ -435,12 +434,12 @@ pub unsafe fn _mm_testz_si128(a: i64x2, mask: i64x2) -> i32 { /// ones. /// /// Arguments: -/// +/// /// * `a` - A 128-bit integer vector containing the bits to be tested. /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. -/// +/// /// Returns: -/// +/// /// * `1` - if the specified bits are all ones, /// * `0` - otherwise. #[inline(always)] @@ -454,12 +453,12 @@ pub unsafe fn _mm_testc_si128(a: i64x2, mask: i64x2) -> i32 { /// neither all zeros nor all ones. /// /// Arguments: -/// +/// /// * `a` - A 128-bit integer vector containing the bits to be tested. /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. -/// +/// /// Returns: -/// +/// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. #[inline(always)] @@ -471,14 +470,14 @@ pub unsafe fn _mm_testnzc_si128(a: i64x2, mask: i64x2) -> i32 { /// Tests whether the specified bits in a 128-bit integer vector are all /// zeros. -/// +/// /// Arguments: -/// +/// /// * `a` - A 128-bit integer vector containing the bits to be tested. /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. -/// +/// /// Returns: -/// +/// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. 
#[inline(always)] @@ -490,13 +489,13 @@ pub unsafe fn _mm_test_all_zeros(a: i64x2, mask: i64x2) -> i32 { /// Tests whether the specified bits in `a` 128-bit integer vector are all /// ones. -/// +/// /// Argument: -/// +/// /// * `a` - A 128-bit integer vector containing the bits to be tested. -/// +/// /// Returns: -/// +/// /// * `1` - if the bits specified in the operand are all set to 1, /// * `0` - otherwise. #[inline(always)] @@ -511,12 +510,12 @@ pub unsafe fn _mm_test_all_ones(a: i64x2) -> i32 { /// neither all zeros nor all ones. /// /// Arguments: -/// +/// /// * `a` - A 128-bit integer vector containing the bits to be tested. /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. -/// +/// /// Returns: -/// +/// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. #[inline(always)] @@ -768,11 +767,9 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 { constify_imm4!(rounding, call) } -/// Finds the minimum u16 in the u16x8 vector, returning it in the first -/// position of the result vector along with its index in the second position; -/// all other elements are set to zero. -/// -/// \headerfile +/// Finds the minimum unsigned 16-bit element in the 128-bit u16x8 vector, +/// returning a vector containing its value in its first position, and its index +/// in its second position; all other elements are set to zero. /// /// This intrinsic corresponds to the VPHMINPOSUW / PHMINPOSUW /// instruction. @@ -817,6 +814,47 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { a * b } +/// Subtracts 8-bit unsigned integer values and computes the absolute +/// values of the differences to the corresponding bits in the destination. +/// Then sums of the absolute differences are returned according to the bit +/// fields in the immediate operand. 
+/// +/// The following algorithm is performed: +/// +/// ```ignore +/// i = imm8[2] * 4 +/// j = imm8[1:0] * 4 +/// for k := 0 to 7 +/// d0 = abs(a[i + k + 0] - b[j + 0]) +/// d1 = abs(a[i + k + 1] - b[j + 1]) +/// d2 = abs(a[i + k + 2] - b[j + 2]) +/// d3 = abs(a[i + k + 3] - b[j + 3]) +/// r[k] = d0 + d1 + d2 + d3 +/// ``` +/// +/// Arguments: +/// +/// * `a` - A 128-bit vector of type `i8x16`. +/// * `b` - A 128-bit vector of type `i8x16`. +/// * `imm8` - An 8-bit immediate operand specifying how the absolute differences are to +/// be calculated +/// * Bit `[2]` specifies the offset for operand `a` +/// * Bits `[1:0]` specify the offset for operand `b` +/// +/// Returns: +/// +/// * An `i16x8` vector containing the sums of the sets of +/// absolute differences between both operands. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(mpsadbw, imm8=0))] +pub unsafe fn _mm_mpsadbw_epu8(a: i8x16, b: i8x16, imm8: u8) -> i16x8 { + macro_rules! call { + ($imm8:expr) => { mpsadbw(a, b, $imm8) } + } + constify_imm3!(imm8, call) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.sse41.pblendvb"] @@ -875,6 +913,8 @@ extern "C" { fn phminposuw(a: u16x8) -> u16x8; #[link_name = "llvm.x86.sse41.pmuldq"] fn pmuldq(a: i32x4, b: i32x4) -> i64x2; + #[link_name = "llvm.x86.sse41.mpsadbw"] + fn mpsadbw(a: i8x16, b: i8x16, imm8: u8) -> i16x8; } #[cfg(test)] @@ -1581,4 +1621,29 @@ mod tests { let e = u16x8::splat(0).replace(0, 1).replace(1, 5); assert_eq!(r, e); } + + #[simd_test = "sse4.1"] + unsafe fn _mm_mpsadbw_epu8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + let r = sse41::_mm_mpsadbw_epu8(a, a, 0b000); + let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28); + assert_eq!(r, e); + + let r = sse41::_mm_mpsadbw_epu8(a, a, 0b001); + let e = i16x8::new(16, 12, 8, 4, 0, 4, 8, 12); + assert_eq!(r, e); + + let r = sse41::_mm_mpsadbw_epu8(a, a, 0b100); + let e = i16x8::new(16, 20, 24, 28, 32, 36, 40, 44); + 
assert_eq!(r, e); + + let r = sse41::_mm_mpsadbw_epu8(a, a, 0b101); + let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28); + assert_eq!(r, e); + + let r = sse41::_mm_mpsadbw_epu8(a, a, 0b111); + let e = i16x8::new(32, 28, 24, 20, 16, 12, 8, 4); + assert_eq!(r, e); + } } From 0d406d0e3d4298afce69d889b4fdb0edc4a7bcb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 7 Nov 2017 14:58:12 +0000 Subject: [PATCH 10/12] Change _mm_mpsadbw_epu8 to work with unsigned integers --- src/x86/sse41.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 88302a2964..eaa345289f 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -848,7 +848,7 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(mpsadbw, imm8=0))] -pub unsafe fn _mm_mpsadbw_epu8(a: i8x16, b: i8x16, imm8: u8) -> i16x8 { +pub unsafe fn _mm_mpsadbw_epu8(a: u8x16, b: u8x16, imm8: u8) -> u16x8 { macro_rules! 
call { ($imm8:expr) => { mpsadbw(a, b, $imm8) } } @@ -914,7 +914,7 @@ extern "C" { #[link_name = "llvm.x86.sse41.pmuldq"] fn pmuldq(a: i32x4, b: i32x4) -> i64x2; #[link_name = "llvm.x86.sse41.mpsadbw"] - fn mpsadbw(a: i8x16, b: i8x16, imm8: u8) -> i16x8; + fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; } #[cfg(test)] @@ -1624,26 +1624,26 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_mpsadbw_epu8() { - let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let r = sse41::_mm_mpsadbw_epu8(a, a, 0b000); - let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28); + let e = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28); assert_eq!(r, e); let r = sse41::_mm_mpsadbw_epu8(a, a, 0b001); - let e = i16x8::new(16, 12, 8, 4, 0, 4, 8, 12); + let e = u16x8::new(16, 12, 8, 4, 0, 4, 8, 12); assert_eq!(r, e); let r = sse41::_mm_mpsadbw_epu8(a, a, 0b100); - let e = i16x8::new(16, 20, 24, 28, 32, 36, 40, 44); + let e = u16x8::new(16, 20, 24, 28, 32, 36, 40, 44); assert_eq!(r, e); let r = sse41::_mm_mpsadbw_epu8(a, a, 0b101); - let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28); + let e = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28); assert_eq!(r, e); let r = sse41::_mm_mpsadbw_epu8(a, a, 0b111); - let e = i16x8::new(32, 28, 24, 20, 16, 12, 8, 4); + let e = u16x8::new(32, 28, 24, 20, 16, 12, 8, 4); assert_eq!(r, e); } } From e6b9e9d53d36633d1a3fecaf814d5a1bf87b7c1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 7 Nov 2017 15:12:06 +0000 Subject: [PATCH 11/12] Make test intrinsics use __m128i --- src/x86/sse41.rs | 94 ++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index eaa345289f..1f02dd38c6 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -6,6 +6,7 @@ use std::mem; use stdsimd_test::assert_instr; use simd_llvm::{simd_shuffle2, simd_shuffle4, simd_shuffle8}; +use 
x86::__m128i; use v128::*; // SSE4 rounding constans @@ -425,8 +426,8 @@ pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] -pub unsafe fn _mm_testz_si128(a: i64x2, mask: i64x2) -> i32 { - ptestz(a, mask) +pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { + ptestz(a.into(), mask.into()) } @@ -445,8 +446,8 @@ pub unsafe fn _mm_testz_si128(a: i64x2, mask: i64x2) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] -pub unsafe fn _mm_testc_si128(a: i64x2, mask: i64x2) -> i32 { - ptestc(a, mask) +pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { + ptestc(a.into(), mask.into()) } /// Tests whether the specified bits in a 128-bit integer vector are @@ -464,8 +465,8 @@ pub unsafe fn _mm_testc_si128(a: i64x2, mask: i64x2) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] -pub unsafe fn _mm_testnzc_si128(a: i64x2, mask: i64x2) -> i32 { - ptestnzc(a, mask) +pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { + ptestnzc(a.into(), mask.into()) } /// Tests whether the specified bits in a 128-bit integer vector are all @@ -483,7 +484,7 @@ pub unsafe fn _mm_testnzc_si128(a: i64x2, mask: i64x2) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] -pub unsafe fn _mm_test_all_zeros(a: i64x2, mask: i64x2) -> i32 { +pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { _mm_testz_si128(a, mask) } @@ -502,8 +503,8 @@ pub unsafe fn _mm_test_all_zeros(a: i64x2, mask: i64x2) -> i32 { #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pcmpeqd))] #[cfg_attr(test, assert_instr(ptest))] -pub unsafe fn _mm_test_all_ones(a: i64x2) -> i32 { - _mm_testc_si128(a, i64x2::splat(-1)) +pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 { + _mm_testc_si128(a, ::x86::sse2::_mm_cmpeq_epi32(a.into(), 
a.into()).into()) } /// Tests whether the specified bits in a 128-bit integer vector are @@ -521,7 +522,7 @@ pub unsafe fn _mm_test_all_ones(a: i64x2) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(ptest))] -pub unsafe fn _mm_test_mix_ones_zeros(a: i64x2, mask: i64x2) -> i32 { +pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { _mm_testnzc_si128(a, mask) } @@ -924,6 +925,7 @@ mod tests { use stdsimd_test::simd_test; use v128::*; + use x86::__m128i; use x86::sse41; #[simd_test = "sse4.1"] @@ -1352,98 +1354,98 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_testz_si128() { - let a = i64x2::splat(1); - let mask = i64x2::splat(0); + let a = __m128i::splat(1); + let mask = __m128i::splat(0); let r = sse41::_mm_testz_si128(a, mask); assert_eq!(r, 1); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b110); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b110); let r = sse41::_mm_testz_si128(a, mask); assert_eq!(r, 0); - let a = i64x2::splat(0b011); - let mask = i64x2::splat(0b100); + let a = __m128i::splat(0b011); + let mask = __m128i::splat(0b100); let r = sse41::_mm_testz_si128(a, mask); assert_eq!(r, 1); } #[simd_test = "sse4.1"] unsafe fn _mm_testc_si128() { - let a = i64x2::splat(-1); - let mask = i64x2::splat(0); + let a = __m128i::splat(-1); + let mask = __m128i::splat(0); let r = sse41::_mm_testc_si128(a, mask); assert_eq!(r, 1); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b110); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b110); let r = sse41::_mm_testc_si128(a, mask); assert_eq!(r, 0); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b100); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b100); let r = sse41::_mm_testc_si128(a, mask); assert_eq!(r, 1); } #[simd_test = "sse4.1"] unsafe fn _mm_testnzc_si128() { - let a = i64x2::splat(0); - let mask = i64x2::splat(1); + let a = __m128i::splat(0); + let mask = 
__m128i::splat(1); let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 0); - let a = i64x2::splat(-1); - let mask = i64x2::splat(0); + let a = __m128i::splat(-1); + let mask = __m128i::splat(0); let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 0); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b110); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b110); let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 1); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b101); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b101); let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 0); } #[simd_test = "sse4.1"] unsafe fn _mm_test_all_zeros() { - let a = i64x2::splat(1); - let mask = i64x2::splat(0); + let a = __m128i::splat(1); + let mask = __m128i::splat(0); let r = sse41::_mm_test_all_zeros(a, mask); assert_eq!(r, 1); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b110); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b110); let r = sse41::_mm_test_all_zeros(a, mask); assert_eq!(r, 0); - let a = i64x2::splat(0b011); - let mask = i64x2::splat(0b100); + let a = __m128i::splat(0b011); + let mask = __m128i::splat(0b100); let r = sse41::_mm_test_all_zeros(a, mask); assert_eq!(r, 1); } #[simd_test = "sse4.1"] unsafe fn _mm_test_all_ones() { - let a = i64x2::splat(-1); + let a = __m128i::splat(-1); let r = sse41::_mm_test_all_ones(a); assert_eq!(r, 1); - let a = i64x2::splat(0b101); + let a = __m128i::splat(0b101); let r = sse41::_mm_test_all_ones(a); assert_eq!(r, 0); } #[simd_test = "sse4.1"] unsafe fn _mm_test_mix_ones_zeros() { - let a = i64x2::splat(0); - let mask = i64x2::splat(1); + let a = __m128i::splat(0); + let mask = __m128i::splat(1); let r = sse41::_mm_test_mix_ones_zeros(a, mask); assert_eq!(r, 0); - let a = i64x2::splat(-1); - let mask = i64x2::splat(0); + let a = __m128i::splat(-1); + let mask = __m128i::splat(0); let r = sse41::_mm_test_mix_ones_zeros(a, mask); 
assert_eq!(r, 0); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b110); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b110); let r = sse41::_mm_test_mix_ones_zeros(a, mask); assert_eq!(r, 1); - let a = i64x2::splat(0b101); - let mask = i64x2::splat(0b101); + let a = __m128i::splat(0b101); + let mask = __m128i::splat(0b101); let r = sse41::_mm_test_mix_ones_zeros(a, mask); assert_eq!(r, 0); } From e6cb4652ef84d65db8cf3775ef1c56c4a8d8824a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 7 Nov 2017 15:29:47 +0000 Subject: [PATCH 12/12] Formatting --- src/lib.rs | 5 +- src/macros.rs | 8 +- src/x86/sse41.rs | 247 ++++++++++++++++++++++++----------------------- 3 files changed, 130 insertions(+), 130 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 509935e42e..685dbef092 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -194,10 +194,7 @@ mod v16 { define_ty! { u8x2, u8, u8 } define_impl! { u8x2, u8, 2, i8x2, x0, x1 } - define_casts!( - (i8x2, i64x2, as_i64x2), - (u8x2, i64x2, as_i64x2) - ); + define_casts!((i8x2, i64x2, as_i64x2), (u8x2, i64x2, as_i64x2)); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] diff --git a/src/macros.rs b/src/macros.rs index c2018acc40..c2009fa939 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -485,7 +485,7 @@ macro_rules! test_arithmetic_ { #[cfg(test)] #[macro_export] - macro_rules! test_neg_ { +macro_rules! test_neg_ { ($tn:ident, $zero:expr, $one:expr, $two:expr, $four:expr) => { { let z = $tn::splat($zero); @@ -573,7 +573,7 @@ macro_rules! test_bit_arithmetic_ { #[cfg(test)] #[macro_export] - macro_rules! test_ops_si { +macro_rules! test_ops_si { ($($tn:ident),+) => { $( test_arithmetic_!($tn, 0, 1, 2, 4); @@ -585,7 +585,7 @@ macro_rules! test_bit_arithmetic_ { #[cfg(test)] #[macro_export] - macro_rules! test_ops_ui { +macro_rules! test_ops_ui { ($($tn:ident),+) => { $( test_arithmetic_!($tn, 0, 1, 2, 4); @@ -596,7 +596,7 @@ macro_rules! 
test_bit_arithmetic_ { #[cfg(test)] #[macro_export] - macro_rules! test_ops_f { +macro_rules! test_ops_f { ($($tn:ident),+) => { $( test_arithmetic_!($tn, 0., 1., 2., 4.); diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 1f02dd38c6..86538ca562 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -139,7 +139,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { } /// Extract an 64-bit integer from `a` selected with `imm8` -#[cfg(target_arch = "x86_64")] +#[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[inline(always)] #[target_feature = "+sse4.1"] // TODO: Add test for Windows @@ -200,7 +200,7 @@ pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 { /// Return a copy of `a` with the 64-bit integer from `i` inserted at a /// location specified by `imm8`. -#[cfg(target_arch = "x86_64")] +#[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))] @@ -370,7 +370,8 @@ pub unsafe fn _mm_cvtepu8_epi64(a: u8x16) -> i64x2 { simd_shuffle2::<_, ::v16::u8x2>(a, a, [0, 1]).as_i64x2() } -/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers +/// Zero extend packed unsigned 16-bit integers in `a` +/// to packed 32-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxwd))] @@ -378,7 +379,8 @@ pub unsafe fn _mm_cvtepu16_epi32(a: u16x8) -> i32x4 { simd_shuffle4::<_, ::v64::u16x4>(a, a, [0, 1, 2, 3]).as_i32x4() } -/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers +/// Zero extend packed unsigned 16-bit integers in `a` +/// to packed 64-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxwq))] @@ -386,7 +388,8 @@ pub unsafe fn _mm_cvtepu16_epi64(a: u16x8) -> i64x2 { simd_shuffle2::<_, ::v32::u16x2>(a, a, [0, 1]).as_i64x2() } -/// Zero extend packed unsigned 32-bit 
integers in `a` to packed 64-bit integers +/// Zero extend packed unsigned 32-bit integers in `a` +/// to packed 64-bit integers #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmovzxdq))] @@ -394,30 +397,14 @@ pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 { simd_shuffle2::<_, ::v64::u32x2>(a, a, [0, 1]).as_i64x2() } -/// Multiply the low 32-bit integers from each packed 64-bit element in `a` and `b` -#[inline(always)] -#[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pmuldq))] -pub unsafe fn _mm_mul_epi32(a: i32x4, b:i32x4) -> i64x2 { - pmuldq(a, b) -} - -/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, -/// and return the low 32 bits of the intermediate integers. -#[inline(always)] -#[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pmulld))] -pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 { - a * b -} - /// Tests whether the specified bits in a 128-bit integer vector are all /// zeros. /// /// Arguments: /// /// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. /// /// Returns: /// @@ -437,7 +424,8 @@ pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { /// Arguments: /// /// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. /// /// Returns: /// @@ -456,7 +444,8 @@ pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { /// Arguments: /// /// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. 
+/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. /// /// Returns: /// @@ -475,7 +464,8 @@ pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { /// Arguments: /// /// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. /// /// Returns: /// @@ -513,7 +503,8 @@ pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 { /// Arguments: /// /// * `a` - A 128-bit integer vector containing the bits to be tested. -/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`. +/// * `mask` - A 128-bit integer vector selecting which bits to test in +/// operand `a`. /// /// Returns: /// @@ -769,7 +760,8 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 { } /// Finds the minimum unsigned 16-bit element in the 128-bit u16x8 vector, -/// returning a vector containing its value in its first position, and its index +/// returning a vector containing its value in its first position, and its +/// index /// in its second position; all other elements are set to zero. /// /// This intrinsic corresponds to the VPHMINPOSUW / PHMINPOSUW @@ -793,8 +785,8 @@ pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 { phminposuw(a) } -/// Multiply the low 32-bit integers from each packed 64-bit element -/// in a and b, and store the signed 64-bit results in dst. +/// Multiply the low 32-bit integers from each packed 64-bit +/// element in `a` and `b`, and return the signed 64-bit result. 
#[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmuldq))] @@ -802,12 +794,12 @@ pub unsafe fn _mm_mul_epi32(a: i32x4, b: i32x4) -> i64x2 { pmuldq(a, b) } -/// Multiply the packed 32-bit integers in a and b, producing intermediate -/// 64-bit integers, and returns the lowest 32-bit, whatever they might be, -/// reinterpreted as a signed integer. -/// While pmulld i32x4::splat(2), i32x4::splat(2) returns the obvious -/// i32x4::splat(4), pmulld i32x4::splat(i32::MAX), i32x4::splat(2) -/// would return a negative number. +/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate +/// 64-bit integers, and returns the lowest 32-bit, whatever they might be, +/// reinterpreted as a signed integer. While `pmulld i32x4::splat(2), +/// i32x4::splat(2)` returns the obvious `i32x4::splat(4)`, due to wrapping +/// arithmetic `pmulld i32x4::splat(i32::MAX), i32x4::splat(2)` would return a +/// negative number. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmulld))] @@ -837,8 +829,8 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { /// /// * `a` - A 128-bit vector of type `i8x16`. /// * `b` - A 128-bit vector of type `i8x16`. -/// * `imm8` - An 8-bit immediate operand specifying how the absolute differences are to -/// be calculated +/// * `imm8` - An 8-bit immediate operand specifying how the absolute +/// differences are to be calculated /// * Bit `[2]` specify the offset for operand `a` /// * Bits `[1:0]` specify the offset for operand `b` /// @@ -848,7 +840,7 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 { /// absolute differences between both operands. #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(mpsadbw, imm8=0))] +#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))] pub unsafe fn _mm_mpsadbw_epu8(a: u8x16, b: u8x16, imm8: u8) -> u16x8 { macro_rules! 
call { ($imm8:expr) => { mpsadbw(a, b, $imm8) } @@ -890,8 +882,6 @@ extern "C" { fn pminud(a: u32x4, b: u32x4) -> u32x4; #[link_name = "llvm.x86.sse41.packusdw"] fn packusdw(a: i32x4, b: i32x4) -> u16x8; - #[link_name = "llvm.x86.sse41.pmuldq"] - fn pmuldq(a: i32x4, b: i32x4) -> i64x2; #[link_name = "llvm.x86.sse41.ptestz"] fn ptestz(a: i64x2, mask: i64x2) -> i32; #[link_name = "llvm.x86.sse41.ptestc"] @@ -921,12 +911,11 @@ extern "C" { #[cfg(test)] mod tests { use std::mem; - use stdsimd_test::simd_test; - + use x86::sse41; use v128::*; + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 use x86::__m128i; - use x86::sse41; #[simd_test = "sse4.1"] unsafe fn _mm_blendv_epi8() { @@ -1020,7 +1009,7 @@ mod tests { assert_eq!(r, 1); } - #[cfg(target_arch = "x86_64")] + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_extract_epi64() { let a = i64x2::new(0, 1); @@ -1059,7 +1048,7 @@ mod tests { assert_eq!(r, e); } - #[cfg(target_arch = "x86_64")] + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi64() { let a = i64x2::splat(0); @@ -1280,7 +1269,7 @@ mod tests { let r = sse41::_mm_cvtepi32_epi64(a); let e = i64x2::splat(10); assert_eq!(r, e); - let a = i32x4::splat(-10); + let a = i32x4::splat(-10); let r = sse41::_mm_cvtepi32_epi64(a); let e = i64x2::splat(-10); assert_eq!(r, e); @@ -1334,119 +1323,107 @@ mod tests { assert_eq!(r, e); } - #[simd_test = "sse4.1"] - unsafe fn _mm_mul_epi32() { - let a = i32x4::new(1, 1, 1, 1); - let b = i32x4::new(1, 2, 3, 4); - let r = sse41::_mm_mul_epi32(a, b); - let e = i64x2::new(1, 3); - assert_eq!(r, e); - } - - #[simd_test = "sse4.1"] - unsafe fn _mm_mullo_epi32() { - let a = i32x4::new(1, 1, 1, 1); - let b = i32x4::new(1, 2, 3, 4); - let r = sse41::_mm_mullo_epi32(a, b); - let e = i32x4::new(1, 2, 3, 4); - assert_eq!(r, e); - } - + #[cfg(all(target_arch = "x86_64", 
not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_testz_si128() { - let a = __m128i::splat(1); + let a = __m128i::splat(1); let mask = __m128i::splat(0); - let r = sse41::_mm_testz_si128(a, mask); + let r = sse41::_mm_testz_si128(a, mask); assert_eq!(r, 1); - let a = __m128i::splat(0b101); + let a = __m128i::splat(0b101); let mask = __m128i::splat(0b110); - let r = sse41::_mm_testz_si128(a, mask); + let r = sse41::_mm_testz_si128(a, mask); assert_eq!(r, 0); - let a = __m128i::splat(0b011); + let a = __m128i::splat(0b011); let mask = __m128i::splat(0b100); - let r = sse41::_mm_testz_si128(a, mask); + let r = sse41::_mm_testz_si128(a, mask); assert_eq!(r, 1); } + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_testc_si128() { - let a = __m128i::splat(-1); + let a = __m128i::splat(-1); let mask = __m128i::splat(0); - let r = sse41::_mm_testc_si128(a, mask); + let r = sse41::_mm_testc_si128(a, mask); assert_eq!(r, 1); - let a = __m128i::splat(0b101); + let a = __m128i::splat(0b101); let mask = __m128i::splat(0b110); - let r = sse41::_mm_testc_si128(a, mask); + let r = sse41::_mm_testc_si128(a, mask); assert_eq!(r, 0); - let a = __m128i::splat(0b101); + let a = __m128i::splat(0b101); let mask = __m128i::splat(0b100); - let r = sse41::_mm_testc_si128(a, mask); + let r = sse41::_mm_testc_si128(a, mask); assert_eq!(r, 1); } + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_testnzc_si128() { - let a = __m128i::splat(0); + let a = __m128i::splat(0); let mask = __m128i::splat(1); - let r = sse41::_mm_testnzc_si128(a, mask); + let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 0); - let a = __m128i::splat(-1); + let a = __m128i::splat(-1); let mask = __m128i::splat(0); - let r = sse41::_mm_testnzc_si128(a, mask); + let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 0); - let a = __m128i::splat(0b101); + let a = 
__m128i::splat(0b101); let mask = __m128i::splat(0b110); - let r = sse41::_mm_testnzc_si128(a, mask); + let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 1); - let a = __m128i::splat(0b101); + let a = __m128i::splat(0b101); let mask = __m128i::splat(0b101); - let r = sse41::_mm_testnzc_si128(a, mask); + let r = sse41::_mm_testnzc_si128(a, mask); assert_eq!(r, 0); } + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_test_all_zeros() { - let a = __m128i::splat(1); + let a = __m128i::splat(1); let mask = __m128i::splat(0); - let r = sse41::_mm_test_all_zeros(a, mask); + let r = sse41::_mm_test_all_zeros(a, mask); assert_eq!(r, 1); - let a = __m128i::splat(0b101); + let a = __m128i::splat(0b101); let mask = __m128i::splat(0b110); - let r = sse41::_mm_test_all_zeros(a, mask); + let r = sse41::_mm_test_all_zeros(a, mask); assert_eq!(r, 0); - let a = __m128i::splat(0b011); + let a = __m128i::splat(0b011); let mask = __m128i::splat(0b100); - let r = sse41::_mm_test_all_zeros(a, mask); + let r = sse41::_mm_test_all_zeros(a, mask); assert_eq!(r, 1); } + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_test_all_ones() { - let a = __m128i::splat(-1); - let r = sse41::_mm_test_all_ones(a); + let a = __m128i::splat(-1); + let r = sse41::_mm_test_all_ones(a); assert_eq!(r, 1); - let a = __m128i::splat(0b101); - let r = sse41::_mm_test_all_ones(a); + let a = __m128i::splat(0b101); + let r = sse41::_mm_test_all_ones(a); assert_eq!(r, 0); } + #[cfg(all(target_arch = "x86_64", not(target_feature = "sse2")))] // i586 #[simd_test = "sse4.1"] unsafe fn _mm_test_mix_ones_zeros() { - let a = __m128i::splat(0); + let a = __m128i::splat(0); let mask = __m128i::splat(1); - let r = sse41::_mm_test_mix_ones_zeros(a, mask); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); assert_eq!(r, 0); - let a = __m128i::splat(-1); + let a = __m128i::splat(-1); let mask = 
__m128i::splat(0); - let r = sse41::_mm_test_mix_ones_zeros(a, mask); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); assert_eq!(r, 0); - let a = __m128i::splat(0b101); + let a = __m128i::splat(0b101); let mask = __m128i::splat(0b110); - let r = sse41::_mm_test_mix_ones_zeros(a, mask); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); assert_eq!(r, 1); - let a = __m128i::splat(0b101); + let a = __m128i::splat(0b101); let mask = __m128i::splat(0b101); - let r = sse41::_mm_test_mix_ones_zeros(a, mask); + let r = sse41::_mm_test_mix_ones_zeros(a, mask); assert_eq!(r, 0); } @@ -1594,27 +1571,52 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_mul_epi32() { - let a = - i32x4::new(15, 2 /* ignored */, 1234567, 4 /* ignored */); - let b = i32x4::new( - -20, - -256, /* ignored */ - 666666, - 666666, /* ignored */ - ); - let r = sse41::_mm_mul_epi32(a, b); - let e = i64x2::new(-300, 823043843622); - assert_eq!(r, e); + { + let a = i32x4::new(1, 1, 1, 1); + let b = i32x4::new(1, 2, 3, 4); + let r = sse41::_mm_mul_epi32(a, b); + let e = i64x2::new(1, 3); + assert_eq!(r, e); + } + { + let a = i32x4::new( + 15, + 2, /* ignored */ + 1234567, + 4, /* ignored */ + ); + let b = i32x4::new( + -20, + -256, /* ignored */ + 666666, + 666666, /* ignored */ + ); + let r = sse41::_mm_mul_epi32(a, b); + let e = i64x2::new(-300, 823043843622); + assert_eq!(r, e); + } } #[simd_test = "sse4.1"] unsafe fn _mm_mullo_epi32() { - let a = i32x4::new(15, -2, 1234567, 99999); - let b = i32x4::new(-20, -256, 666666, -99999); - let r = sse41::_mm_mullo_epi32(a, b); - // Attention, most significant bit in r[2] is treated as a sign bit! 
- // 1234567 * 666666 = -1589877210 - let e = i32x4::new(-300, 512, -1589877210, -1409865409); + { + let a = i32x4::new(1, 1, 1, 1); + let b = i32x4::new(1, 2, 3, 4); + let r = sse41::_mm_mullo_epi32(a, b); + let e = i32x4::new(1, 2, 3, 4); + assert_eq!(r, e); + } + { + let a = i32x4::new(15, -2, 1234567, 99999); + let b = i32x4::new(-20, -256, 666666, -99999); + let r = sse41::_mm_mullo_epi32(a, b); + // Attention, most significant bit in r[2] is treated + // as a sign bit: + // 1234567 * 666666 = -1589877210 + let e = i32x4::new(-300, 512, -1589877210, -1409865409); + assert_eq!(r, e); + } + } #[simd_test = "sse4.1"] unsafe fn _mm_minpos_epu16() { @@ -1626,7 +1628,8 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_mpsadbw_epu8() { - let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let r = sse41::_mm_mpsadbw_epu8(a, a, 0b000); let e = u16x8::new(0, 4, 8, 12, 16, 20, 24, 28); @@ -1635,7 +1638,7 @@ mod tests { let r = sse41::_mm_mpsadbw_epu8(a, a, 0b001); let e = u16x8::new(16, 12, 8, 4, 0, 4, 8, 12); assert_eq!(r, e); - + let r = sse41::_mm_mpsadbw_epu8(a, a, 0b100); let e = u16x8::new(16, 20, 24, 28, 32, 36, 40, 44); assert_eq!(r, e);