diff --git a/crates/core_arch/src/aarch64/armclang.rs b/crates/core_arch/src/aarch64/armclang.rs new file mode 100644 index 0000000000..54847be7b1 --- /dev/null +++ b/crates/core_arch/src/aarch64/armclang.rs @@ -0,0 +1,23 @@ +//! ARM compiler specific intrinsics +//! +//! # References +//! +//! - [ARM Compiler v 6.10 - armclang Reference Guide][arm_comp_ref] +//! +//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Inserts a breakpoint instruction. +/// +/// `VAL` is a compile-time constant integer in range `[0, 65535]`. +/// +/// The breakpoint instruction inserted is `BRK` on A64. +#[cfg_attr(test, assert_instr(brk, VAL = 0))] +#[inline(always)] +#[rustc_legacy_const_generics(0)] +pub unsafe fn __breakpoint() { + static_assert_imm16!(VAL); + asm!("brk {}", const VAL); +} diff --git a/crates/core_arch/src/aarch64/mod.rs b/crates/core_arch/src/aarch64/mod.rs index f6d0fc9dbd..0411fc1068 100644 --- a/crates/core_arch/src/aarch64/mod.rs +++ b/crates/core_arch/src/aarch64/mod.rs @@ -21,7 +21,11 @@ pub use self::crc::*; mod prefetch; pub use self::prefetch::*; -pub use super::acle::*; +pub use super::arm_shared::*; + +mod armclang; + +pub use self::armclang::*; #[cfg(test)] use stdarch_test::assert_instr; diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index a925fbc504..71e6b83a63 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -10,7 +10,7 @@ pub use self::generated::*; // FIXME: replace neon with asimd use crate::{ - core_arch::{arm::*, simd::*, simd_llvm::*}, + core_arch::{arm_shared::*, simd::*, simd_llvm::*}, hint::unreachable_unchecked, mem::{transmute, zeroed}, }; @@ -2812,7 +2812,7 @@ pub unsafe fn vsriq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x #[cfg(test)] mod tests { use crate::core_arch::aarch64::test_support::*; - use crate::core_arch::arm::test_support::*; + use crate::core_arch::arm_shared::test_support::*; use crate::core_arch::{aarch64::neon::*, aarch64::*, simd::*}; use std::mem::transmute; use stdarch_test::simd_test; @@ -4261,13 +4261,13 @@ mod tests { #[cfg(test)] #[cfg(target_endian = "little")] -#[path = "../../arm/neon/table_lookup_tests.rs"] +#[path = "../../arm_shared/neon/table_lookup_tests.rs"] mod table_lookup_tests; #[cfg(test)] -#[path = "../../arm/neon/shift_and_insert_tests.rs"] +#[path = "../../arm_shared/neon/shift_and_insert_tests.rs"] mod shift_and_insert_tests; #[cfg(test)] -#[path = "../../arm/neon/load_tests.rs"] +#[path = "../../arm_shared/neon/load_tests.rs"] mod load_tests; diff --git a/crates/core_arch/src/aarch64/test_support.rs b/crates/core_arch/src/aarch64/test_support.rs index e08c39a545..9c5994b150 100644 --- a/crates/core_arch/src/aarch64/test_support.rs +++ b/crates/core_arch/src/aarch64/test_support.rs @@ -1,4 +1,4 @@ -use crate::core_arch::{aarch64::neon::*, arm::*, simd::*}; +use crate::core_arch::{aarch64::neon::*, arm_shared::*, simd::*}; use std::{i16, i32, i8, mem::transmute, u16, u32, u8, vec::Vec}; macro_rules! V_u64 { diff --git a/crates/core_arch/src/arm/armclang.rs b/crates/core_arch/src/arm/armclang.rs index aa4bab49f1..6b332e82c0 100644 --- a/crates/core_arch/src/arm/armclang.rs +++ b/crates/core_arch/src/arm/armclang.rs @@ -9,20 +9,6 @@ #[cfg(test)] use stdarch_test::assert_instr; -/// Inserts a breakpoint instruction. -/// -/// `VAL` is a compile-time constant integer in range `[0, 65535]`. -/// -/// The breakpoint instruction inserted is `BRK` on A64. 
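A minimal usage sketch of the `__breakpoint` intrinsic added in the new `aarch64/armclang.rs` above, assuming it is re-exported as usual under `core::arch::aarch64` (illustrative only, not part of the diff):

```rust
// Illustrative usage only. Assumes an AArch64 target and a toolchain where
// core::arch::aarch64::__breakpoint is available; the const-generic call form
// matches the rustc_legacy_const_generics(0) wrapper defined above.
use core::arch::aarch64::__breakpoint;

fn stop_here() {
    // VAL = 42 becomes the 16-bit immediate of the emitted `BRK` instruction;
    // values outside [0, 65535] are rejected at compile time by static_assert_imm16.
    unsafe { __breakpoint::<42>() };
}
```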
-#[cfg(all(target_arch = "aarch64", not(doc)))] -#[cfg_attr(test, assert_instr(brk, VAL = 0))] -#[inline(always)] -#[rustc_legacy_const_generics(0)] -pub unsafe fn __breakpoint() { - static_assert_imm16!(VAL); - asm!("brk {}", const VAL); -} - /// Inserts a breakpoint instruction. /// /// `VAL` is a compile-time constant integer in range `[0, 255]`. @@ -40,8 +26,6 @@ pub unsafe fn __breakpoint() { /// The current implementation only accepts values in range `[0, 255]`. /// /// [arm_docs]: https://developer.arm.com/docs/100067/latest/compiler-specific-intrinsics/__breakpoint-intrinsic -#[cfg(any(target_arch = "arm", doc))] -#[doc(cfg(target_arch = "arm"))] #[cfg_attr(test, assert_instr(bkpt, VAL = 0))] #[inline(always)] #[rustc_legacy_const_generics(0)] diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/arm/dsp.rs similarity index 100% rename from crates/core_arch/src/acle/dsp.rs rename to crates/core_arch/src/arm/dsp.rs diff --git a/crates/core_arch/src/acle/ex.rs b/crates/core_arch/src/arm/ex.rs similarity index 95% rename from crates/core_arch/src/acle/ex.rs rename to crates/core_arch/src/arm/ex.rs index 0426c65186..b9d5047a05 100644 --- a/crates/core_arch/src/acle/ex.rs +++ b/crates/core_arch/src/arm/ex.rs @@ -8,6 +8,7 @@ #[cfg(any( all(target_feature = "v6k", not(target_feature = "mclass")), // excludes v6-M all(target_feature = "v7", target_feature = "mclass"), // v7-M + doc ))] pub unsafe fn __clrex() { extern "C" { @@ -21,9 +22,10 @@ pub unsafe fn __clrex() { /// Executes a exclusive LDR instruction for 8 bit value. // Supported: v6K, v7-M, v7-A, v7-R // Not supported: v5, v6, v6-M -#[cfg( +#[cfg(any( target_feature = "v6k", // includes v7-M but excludes v6-M -)] + doc +))] pub unsafe fn __ldrexb(p: *const u8) -> u8 { extern "C" { #[link_name = "llvm.arm.ldrex.p0i8"] @@ -36,9 +38,10 @@ pub unsafe fn __ldrexb(p: *const u8) -> u8 { /// Executes a exclusive LDR instruction for 16 bit value. 
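The exclusive-access intrinsics in this file are intended to be used in LDREX/STREX retry loops; a minimal sketch, assuming an ARMv6K-or-later target and the usual `core::arch::arm` re-export (`__strexb`, used below, appears later in this same hunk):

```rust
// Sketch only, not part of the diff. Assumes __ldrexb/__strexb are available
// (ARMv6K and later, per the cfg gates in this file).
use core::arch::arm::{__ldrexb, __strexb};

/// Atomically increments the byte at `addr` using an exclusive retry loop.
unsafe fn atomic_inc_u8(addr: *mut u8) {
    loop {
        let old = __ldrexb(addr);
        // __strexb returns 0 on success and 1 if the exclusive monitor lost
        // the reservation, in which case we retry.
        if __strexb(u32::from(old.wrapping_add(1)), addr) == 0 {
            break;
        }
    }
}
```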
// Supported: v6K, v7-M, v7-A, v7-R, v8 // Not supported: v5, v6, v6-M -#[cfg( +#[cfg(any( target_feature = "v6k", // includes v7-M but excludes v6-M -)] + doc +))] pub unsafe fn __ldrexh(p: *const u16) -> u16 { extern "C" { #[link_name = "llvm.arm.ldrex.p0i16"] @@ -54,6 +57,7 @@ pub unsafe fn __ldrexh(p: *const u16) -> u16 { #[cfg(any( all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M all(target_feature = "v7", target_feature = "mclass"), // v7-M + doc ))] pub unsafe fn __ldrex(p: *const u32) -> u32 { extern "C" { @@ -69,9 +73,10 @@ pub unsafe fn __ldrex(p: *const u32) -> u32 { /// Returns `0` if the operation succeeded, or `1` if it failed // supported: v6K, v7-M, v7-A, v7-R // Not supported: v5, v6, v6-M -#[cfg( +#[cfg(any( target_feature = "v6k", // includes v7-M but excludes v6-M -)] + doc +))] pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 { extern "C" { #[link_name = "llvm.arm.strex.p0i8"] @@ -86,9 +91,11 @@ pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 { /// Returns `0` if the operation succeeded, or `1` if it failed // Supported: v6K, v7-M, v7-A, v7-R, v8 // Not supported: v5, v6, v6-M -#[cfg( +#[cfg(target_feature = "aarch64")] +#[cfg(any( target_feature = "v6k", // includes v7-M but excludes v6-M -)] + doc +))] pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 { extern "C" { #[link_name = "llvm.arm.strex.p0i16"] @@ -106,6 +113,7 @@ pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 { #[cfg(any( all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M all(target_feature = "v7", target_feature = "mclass"), // v7-M + doc ))] pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 { extern "C" { diff --git a/crates/core_arch/src/arm/mod.rs b/crates/core_arch/src/arm/mod.rs index fd0cb2cf8d..d6b12b8292 100644 --- a/crates/core_arch/src/arm/mod.rs +++ b/crates/core_arch/src/arm/mod.rs @@ -5,40 +5,81 @@ //! //! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf //! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics -#![allow(non_camel_case_types)] mod armclang; - pub use self::armclang::*; mod v6; pub use self::v6::*; -#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] -mod v7; -#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] -pub use self::v7::*; +// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT) +#[cfg(any(target_feature = "v6", doc))] +mod sat; + +#[cfg(any(target_feature = "v6", doc))] +pub use self::sat::*; + +// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD) +// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see +// section 5.4.7) +// Here we workaround the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on +// '+v5te' rather than on '+dsp' +#[cfg(any( + // >= v5TE but excludes v7-M + all(target_feature = "v5te", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +pub mod dsp; + +#[cfg(any( + // >= v5TE but excludes v7-M + all(target_feature = "v5te", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +pub use self::dsp::*; + +// Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says +// Section 5.4.9 of ACLE. 
We'll expose these for the A profile even if deprecated +#[cfg(any( + // v7-A, v7-R + all(target_feature = "v6", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +mod simd32; -#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] -mod neon; -#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] -pub use self::neon::*; +#[cfg(any( + // v7-A, v7-R + all(target_feature = "v6", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +pub use self::simd32::*; -#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] -mod crc; -#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] -pub use self::crc::*; +#[cfg(any(target_feature = "v7", doc))] +mod v7; +#[cfg(any(target_feature = "v7", doc))] +pub use self::v7::*; -#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] -mod crypto; -#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] -pub use self::crypto::*; +mod ex; +pub use self::ex::*; -pub use crate::core_arch::acle::*; +pub use crate::core_arch::arm_shared::*; #[cfg(test)] use stdarch_test::assert_instr; +#[cfg(any(target_feature = "v7", doc))] +pub(crate) mod neon; +#[cfg(any(target_feature = "v7", doc))] +pub use neon::*; + /// Generates the trap instruction `UDF` #[cfg(target_arch = "arm")] #[cfg_attr(test, assert_instr(udf))] @@ -47,6 +88,26 @@ pub unsafe fn udf() -> ! { crate::intrinsics::abort() } -#[cfg(test)] -#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] -pub(crate) mod test_support; +/// Generates a DBG instruction. +/// +/// This provides a hint to debugging and related systems. The argument must be +/// a constant integer from 0 to 15 inclusive. See implementation documentation +/// for the effect (if any) of this instruction and the meaning of the +/// argument. This is available only when compliling for AArch32. +// Section 10.1 of ACLE says that the supported arches are: 7, 7-M +// "The DBG hint instruction is added in ARMv7. It is UNDEFINED in the ARMv6 base architecture, and +// executes as a NOP instruction in ARMv6K and ARMv6T2." 
- ARM Architecture Reference Manual ARMv7-A +// and ARMv7-R edition (ARM DDI 0406C.c) sections D12.4.1 "ARM instruction set support" and D12.4.2 +// "Thumb instruction set support" +#[cfg(any(target_feature = "v7", doc))] +#[inline(always)] +#[rustc_legacy_const_generics(0)] +pub unsafe fn __dbg() { + static_assert_imm4!(IMM4); + dbg(IMM4); +} + +extern "C" { + #[link_name = "llvm.arm.dbg"] + fn dbg(_: i32); +} diff --git a/crates/core_arch/src/arm/neon.rs b/crates/core_arch/src/arm/neon.rs new file mode 100644 index 0000000000..6bb1d0bfd4 --- /dev/null +++ b/crates/core_arch/src/arm/neon.rs @@ -0,0 +1,1091 @@ +use crate::core_arch::arm_shared::neon::*; +use crate::core_arch::simd::{f32x4, i32x4, u32x4}; +use crate::core_arch::simd_llvm::*; +use crate::mem::transmute; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(non_camel_case_types)] +pub(crate) type p8 = u8; +#[allow(non_camel_case_types)] +pub(crate) type p16 = u16; + +use crate::mem::align_of; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.arm.neon.vbsl.v8i8"] + fn vbsl_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vbsl.v16i8"] + fn vbslq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; + #[link_name = "llvm.arm.neon.vpadals.v4i16.v8i8"] + pub(crate) fn vpadal_s8_(a: int16x4_t, b: int8x8_t) -> int16x4_t; + #[link_name = "llvm.arm.neon.vpadals.v2i32.v4i16"] + pub(crate) fn vpadal_s16_(a: int32x2_t, b: int16x4_t) -> int32x2_t; + #[link_name = "llvm.arm.neon.vpadals.v1i64.v2i32"] + pub(crate) fn vpadal_s32_(a: int64x1_t, b: int32x2_t) -> int64x1_t; + #[link_name = "llvm.arm.neon.vpadals.v8i16.v16i8"] + pub(crate) fn vpadalq_s8_(a: int16x8_t, b: int8x16_t) -> int16x8_t; + #[link_name = "llvm.arm.neon.vpadals.v4i32.v8i16"] + pub(crate) fn vpadalq_s16_(a: int32x4_t, b: int16x8_t) -> int32x4_t; + #[link_name = "llvm.arm.neon.vpadals.v2i64.v4i32"] + pub(crate) fn vpadalq_s32_(a: int64x2_t, b: int32x4_t) -> int64x2_t; + + #[link_name = "llvm.arm.neon.vpadalu.v4i16.v8i8"] + pub(crate) fn vpadal_u8_(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t; + #[link_name = "llvm.arm.neon.vpadalu.v2i32.v4i16"] + pub(crate) fn vpadal_u16_(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t; + #[link_name = "llvm.arm.neon.vpadalu.v1i64.v2i32"] + pub(crate) fn vpadal_u32_(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t; + #[link_name = "llvm.arm.neon.vpadalu.v8i16.v16i8"] + pub(crate) fn vpadalq_u8_(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t; + #[link_name = "llvm.arm.neon.vpadalu.v4i32.v8i16"] + pub(crate) fn vpadalq_u16_(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t; + #[link_name = "llvm.arm.neon.vpadalu.v2i64.v4i32"] + pub(crate) fn vpadalq_u32_(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t; + + #[link_name = "llvm.arm.neon.vtbl1"] + fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbl2"] + fn vtbl2(a: int8x8_t, b: int8x8_t, b: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbl3"] + fn vtbl3(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbl4"] + fn vtbl4(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t; + + #[link_name = "llvm.arm.neon.vtbx1"] + fn vtbx1(a: int8x8_t, b: int8x8_t, b: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbx2"] + fn vtbx2(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbx3"] + fn vtbx3(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t; + #[link_name = 
"llvm.arm.neon.vtbx4"] + fn vtbx4( + a: int8x8_t, + b: int8x8_t, + b: int8x8_t, + c: int8x8_t, + d: int8x8_t, + e: int8x8_t, + ) -> int8x8_t; + + #[link_name = "llvm.arm.neon.vshiftins.v8i8"] + fn vshiftins_v8i8(a: int8x8_t, b: int8x8_t, shift: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vshiftins.v16i8"] + fn vshiftins_v16i8(a: int8x16_t, b: int8x16_t, shift: int8x16_t) -> int8x16_t; + #[link_name = "llvm.arm.neon.vshiftins.v4i16"] + fn vshiftins_v4i16(a: int16x4_t, b: int16x4_t, shift: int16x4_t) -> int16x4_t; + #[link_name = "llvm.arm.neon.vshiftins.v8i16"] + fn vshiftins_v8i16(a: int16x8_t, b: int16x8_t, shift: int16x8_t) -> int16x8_t; + #[link_name = "llvm.arm.neon.vshiftins.v2i32"] + fn vshiftins_v2i32(a: int32x2_t, b: int32x2_t, shift: int32x2_t) -> int32x2_t; + #[link_name = "llvm.arm.neon.vshiftins.v4i32"] + fn vshiftins_v4i32(a: int32x4_t, b: int32x4_t, shift: int32x4_t) -> int32x4_t; + #[link_name = "llvm.arm.neon.vshiftins.v1i64"] + fn vshiftins_v1i64(a: int64x1_t, b: int64x1_t, shift: int64x1_t) -> int64x1_t; + #[link_name = "llvm.arm.neon.vshiftins.v2i64"] + fn vshiftins_v2i64(a: int64x2_t, b: int64x2_t, shift: int64x2_t) -> int64x2_t; + + #[link_name = "llvm.arm.neon.vld1.v8i8.p0i8"] + fn vld1_v8i8(addr: *const i8, align: i32) -> int8x8_t; + #[link_name = "llvm.arm.neon.vld1.v16i8.p0i8"] + fn vld1q_v16i8(addr: *const i8, align: i32) -> int8x16_t; + #[link_name = "llvm.arm.neon.vld1.v4i16.p0i8"] + fn vld1_v4i16(addr: *const i8, align: i32) -> int16x4_t; + #[link_name = "llvm.arm.neon.vld1.v8i16.p0i8"] + fn vld1q_v8i16(addr: *const i8, align: i32) -> int16x8_t; + #[link_name = "llvm.arm.neon.vld1.v2i32.p0i8"] + fn vld1_v2i32(addr: *const i8, align: i32) -> int32x2_t; + #[link_name = "llvm.arm.neon.vld1.v4i32.p0i8"] + fn vld1q_v4i32(addr: *const i8, align: i32) -> int32x4_t; + #[link_name = "llvm.arm.neon.vld1.v1i64.p0i8"] + fn vld1_v1i64(addr: *const i8, align: i32) -> int64x1_t; + #[link_name = "llvm.arm.neon.vld1.v2i64.p0i8"] + fn vld1q_v2i64(addr: *const i8, align: i32) -> int64x2_t; + #[link_name = "llvm.arm.neon.vld1.v2f32.p0i8"] + fn vld1_v2f32(addr: *const i8, align: i32) -> float32x2_t; + #[link_name = "llvm.arm.neon.vld1.v4f32.p0i8"] + fn vld1q_v4f32(addr: *const i8, align: i32) -> float32x4_t; +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t { + vld1_v8i8(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t { + vld1q_v16i8(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t { + vld1_v4i16(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t { + vld1q_v8i16(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t { + vld1_v2i32(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t { + vld1q_v4i32(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t { + vld1_v1i64(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.64"))] +pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t { + vld1q_v2i64(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t { + transmute(vld1_v8i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t { + transmute(vld1q_v16i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t { + transmute(vld1_v4i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t { + transmute(vld1q_v8i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t { + transmute(vld1_v2i32(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t { + transmute(vld1q_v4i32(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t { + transmute(vld1_v1i64(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.64"))] +pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t { + transmute(vld1q_v2i64(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t { + transmute(vld1_v8i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t { + transmute(vld1q_v16i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t { + transmute(vld1_v4i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { + transmute(vld1q_v8i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t { + vld1_v2f32(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
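As a usage sketch for the `vld1_*` wrappers defined above (illustrative only, assuming an AArch32 NEON target and the usual `core::arch::arm` re-export):

```rust
// Sketch only, not part of the diff.
use core::arch::arm::{float32x2_t, vld1_f32};

/// Loads two consecutive f32 values into a 64-bit NEON register.
unsafe fn load_pair(data: &[f32; 2]) -> float32x2_t {
    // Only the natural alignment of f32 is required; the wrapper forwards
    // align_of::<f32>() as the alignment hint to llvm.arm.neon.vld1.
    vld1_f32(data.as_ptr())
}
```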
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t { + vld1q_v4f32(ptr as *const i8, align_of::() as i32) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + vtbl1(a, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl1(transmute(a), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl1(transmute(a), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t { + vtbl2(a.0, a.1, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t { + vtbl3(a.0, a.1, a.2, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t { + vtbl4(a.0, a.1, a.2, a.3, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )) +} + +/// Extended table look-up +#[inline] 
+#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + vtbx1(a, b, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx1(transmute(a), transmute(b), transmute(c))) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t { + transmute(vtbx1(transmute(a), transmute(b), transmute(c))) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t { + vtbx2(a, b.0, b.1, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t { + transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t { + vtbx3(a, b.0, b.1, b.2, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { + transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t { + vtbx4(a, b.0, b.1, b.2, b.3, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> 
poly8x8_t { + transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )) +} + +// These float-to-int implementations have undefined behaviour when `a` overflows +// the destination type. Clang has the same problem: https://llvm.org/PR47510 + +/// Floating-point Convert to Signed fixed-point, rounding toward Zero (vector) +#[inline] +#[target_feature(enable = "neon")] +#[target_feature(enable = "v7")] +#[cfg_attr(test, assert_instr("vcvt.s32.f32"))] +pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t { + transmute(simd_cast::<_, i32x4>(transmute::<_, f32x4>(a))) +} + +/// Floating-point Convert to Unsigned fixed-point, rounding toward Zero (vector) +#[inline] +#[target_feature(enable = "neon")] +#[target_feature(enable = "v7")] +#[cfg_attr(test, assert_instr("vcvt.u32.f32"))] +pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t { + transmute(simd_cast::<_, u32x4>(transmute::<_, f32x4>(a))) +} + +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert_imm3!(N); + let n = N as i8; + vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert_imm3!(N); + let n = N as i8; + vshiftins_v16i8( + a, + b, + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + ) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_imm4!(N); + let n = N as i16; + vshiftins_v4i16(a, b, int16x4_t(n, n, n, n)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_imm4!(N); + let n = N as i16; + vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + vshiftins_v2i32(a, b, int32x2_t(N, N)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + vshiftins_v4i32(a, b, int32x4_t(N, N, N, N)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + vshiftins_v1i64(a, b, int64x1_t(N as i64)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, 
assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + vshiftins_v2i64(a, b, int64x2_t(N as i64, N as i64)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + transmute(vshiftins_v2i32(transmute(a), transmute(b), int32x2_t(N, N))) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + transmute(vshiftins_v4i32( + transmute(a), + transmute(b), + int32x4_t(N, N, N, N), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t(N as i64), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t(N as i64, N as i64), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] 
+#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} + +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + vshiftins_v16i8( + a, + b, + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + ) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + vshiftins_v4i16(a, b, int16x4_t(n, n, n, n)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + vshiftins_v2i32(a, b, int32x2_t(-N, -N)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s32(a: int32x4_t, b: 
int32x4_t) -> int32x4_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + vshiftins_v4i32(a, b, int32x4_t(-N, -N, -N, -N)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + vshiftins_v1i64(a, b, int64x1_t(-N as i64)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + vshiftins_v2i64(a, b, int64x2_t(-N as i64, -N as i64)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + transmute(vshiftins_v2i32( + transmute(a), + transmute(b), + int32x2_t(-N, -N), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + transmute(vshiftins_v4i32( + transmute(a), + transmute(b), + int32x4_t(-N, -N, -N, -N), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u64(a: uint64x1_t, b: 
uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t(-N as i64), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t(-N as i64, -N as i64), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::core_arch::{arm::*, simd::*}; + use crate::mem::transmute; + use stdarch_test::simd_test; + + #[simd_test(enable = "neon")] + unsafe fn test_vcvtq_s32_f32() { + let f = f32x4::new(-1., 2., 3., 4.); + let e = i32x4::new(-1, 2, 3, 4); + let r: i32x4 = transmute(vcvtq_s32_f32(transmute(f))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcvtq_u32_f32() { + let f = f32x4::new(1., 2., 3., 4.); + let e = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vcvtq_u32_f32(transmute(f))); + assert_eq!(r, e); + } +} diff --git a/crates/core_arch/src/acle/sat.rs b/crates/core_arch/src/arm/sat.rs similarity index 100% rename from crates/core_arch/src/acle/sat.rs rename to crates/core_arch/src/arm/sat.rs diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/arm/simd32.rs similarity index 99% rename from crates/core_arch/src/acle/simd32.rs rename to crates/core_arch/src/arm/simd32.rs index 04183d4b88..5cae2fc2aa 100644 --- a/crates/core_arch/src/acle/simd32.rs +++ b/crates/core_arch/src/arm/simd32.rs @@ -65,7 +65,7 @@ #[cfg(test)] use stdarch_test::assert_instr; -use crate::{core_arch::acle::dsp::int16x2_t, mem::transmute}; +use 
crate::{core_arch::arm::dsp::int16x2_t, mem::transmute}; types! { /// ARM-specific 32-bit wide vector of four packed `i8`. diff --git a/crates/core_arch/src/acle/barrier/common.rs b/crates/core_arch/src/arm_shared/barrier/common.rs similarity index 100% rename from crates/core_arch/src/acle/barrier/common.rs rename to crates/core_arch/src/arm_shared/barrier/common.rs diff --git a/crates/core_arch/src/acle/barrier/cp15.rs b/crates/core_arch/src/arm_shared/barrier/cp15.rs similarity index 100% rename from crates/core_arch/src/acle/barrier/cp15.rs rename to crates/core_arch/src/arm_shared/barrier/cp15.rs diff --git a/crates/core_arch/src/acle/barrier/mod.rs b/crates/core_arch/src/arm_shared/barrier/mod.rs similarity index 100% rename from crates/core_arch/src/acle/barrier/mod.rs rename to crates/core_arch/src/arm_shared/barrier/mod.rs diff --git a/crates/core_arch/src/acle/barrier/not_mclass.rs b/crates/core_arch/src/arm_shared/barrier/not_mclass.rs similarity index 100% rename from crates/core_arch/src/acle/barrier/not_mclass.rs rename to crates/core_arch/src/arm_shared/barrier/not_mclass.rs diff --git a/crates/core_arch/src/acle/barrier/v8.rs b/crates/core_arch/src/arm_shared/barrier/v8.rs similarity index 100% rename from crates/core_arch/src/acle/barrier/v8.rs rename to crates/core_arch/src/arm_shared/barrier/v8.rs diff --git a/crates/core_arch/src/arm/crc.rs b/crates/core_arch/src/arm_shared/crc.rs similarity index 98% rename from crates/core_arch/src/arm/crc.rs rename to crates/core_arch/src/arm_shared/crc.rs index ffce20fe22..b1cfbb381b 100644 --- a/crates/core_arch/src/arm/crc.rs +++ b/crates/core_arch/src/arm_shared/crc.rs @@ -79,7 +79,7 @@ pub unsafe fn __crc32cw(crc: u32, data: u32) -> u32 { #[cfg(test)] mod tests { - use crate::core_arch::{arm::*, simd::*}; + use crate::core_arch::{arm_shared::*, simd::*}; use std::mem; use stdarch_test::simd_test; diff --git a/crates/core_arch/src/arm/crypto.rs b/crates/core_arch/src/arm_shared/crypto.rs similarity index 99% rename from crates/core_arch/src/arm/crypto.rs rename to crates/core_arch/src/arm_shared/crypto.rs index 8361e39646..b4d5b2978f 100644 --- a/crates/core_arch/src/arm/crypto.rs +++ b/crates/core_arch/src/arm_shared/crypto.rs @@ -1,4 +1,4 @@ -use crate::core_arch::arm::{uint32x4_t, uint8x16_t}; +use crate::core_arch::arm_shared::{uint32x4_t, uint8x16_t}; #[allow(improper_ctypes)] extern "C" { @@ -191,7 +191,8 @@ pub unsafe fn vsha256su1q_u32( #[cfg(test)] mod tests { - use crate::core_arch::{arm::*, simd::*}; + use super::*; + use crate::core_arch::{arm_shared::*, simd::*}; use std::mem; use stdarch_test::simd_test; diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/arm_shared/hints.rs similarity index 69% rename from crates/core_arch/src/acle/hints.rs rename to crates/core_arch/src/arm_shared/hints.rs index 280aa00cf8..3145cde8d5 100644 --- a/crates/core_arch/src/acle/hints.rs +++ b/crates/core_arch/src/arm_shared/hints.rs @@ -9,7 +9,7 @@ /// low-power state until one of a number of asynchronous events occurs. // Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M // LLVM says "instruction requires: armv6k" -#[cfg(any(target_feature = "v6", target_arch = "aarch64"))] +#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))] #[inline(always)] pub unsafe fn __wfi() { hint(HINT_WFI); @@ -22,7 +22,7 @@ pub unsafe fn __wfi() { /// another processor. 
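The relocated `crc` module keeps its API unchanged; a hedged usage sketch of the `__crc32cw` intrinsic whose signature is visible in the hunk above (assuming a target with the CRC extension and the usual `core::arch::arm` re-export):

```rust
// Sketch only, not part of the diff. Assumes the `crc` target feature is
// available and __crc32cw is re-exported from core::arch::arm.
use core::arch::arm::__crc32cw;

/// Folds a slice of words into a running CRC-32C accumulator.
unsafe fn crc32c_words(mut crc: u32, words: &[u32]) -> u32 {
    for &w in words {
        crc = __crc32cw(crc, w);
    }
    crc
}
```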
// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M // LLVM says "instruction requires: armv6k" -#[cfg(any(target_feature = "v6", target_arch = "aarch64"))] +#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))] #[inline(always)] pub unsafe fn __wfe() { hint(HINT_WFE); @@ -34,7 +34,7 @@ pub unsafe fn __wfe() { /// system. It is a NOP on a uniprocessor system. // Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M, 7-M // LLVM says "instruction requires: armv6k" -#[cfg(any(target_feature = "v6", target_arch = "aarch64"))] +#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))] #[inline(always)] pub unsafe fn __sev() { hint(HINT_SEV); @@ -49,6 +49,7 @@ pub unsafe fn __sev() { #[cfg(any( target_feature = "v8", // 32-bit ARMv8 target_arch = "aarch64", // AArch64 + doc, ))] #[inline(always)] pub unsafe fn __sevl() { @@ -62,33 +63,12 @@ pub unsafe fn __sevl() { /// improve overall system performance. // Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M // LLVM says "instruction requires: armv6k" -#[cfg(any(target_feature = "v6", target_arch = "aarch64"))] +#[cfg(any(target_feature = "v6", target_arch = "aarch64", doc))] #[inline(always)] pub unsafe fn __yield() { hint(HINT_YIELD); } -/// Generates a DBG instruction. -/// -/// This provides a hint to debugging and related systems. The argument must be -/// a constant integer from 0 to 15 inclusive. See implementation documentation -/// for the effect (if any) of this instruction and the meaning of the -/// argument. This is available only when compliling for AArch32. -// Section 10.1 of ACLE says that the supported arches are: 7, 7-M -// "The DBG hint instruction is added in ARMv7. It is UNDEFINED in the ARMv6 base architecture, and -// executes as a NOP instruction in ARMv6K and ARMv6T2." - ARM Architecture Reference Manual ARMv7-A -// and ARMv7-R edition (ARM DDI 0406C.c) sections D12.4.1 "ARM instruction set support" and D12.4.2 -// "Thumb instruction set support" -#[cfg(target_feature = "v7")] -#[cfg(any(target_arch = "arm", doc))] -#[doc(cfg(target_arch = "arm"))] -#[inline(always)] -#[rustc_legacy_const_generics(0)] -pub unsafe fn __dbg() { - static_assert_imm4!(IMM4); - dbg(IMM4); -} - /// Generates an unspecified no-op instruction. /// /// Note that not all architectures provide a distinguished NOP instruction. On @@ -104,10 +84,6 @@ extern "C" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.hint")] #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.hint")] fn hint(_: i32); - - #[cfg(target_arch = "arm")] - #[link_name = "llvm.arm.dbg"] - fn dbg(_: i32); } // from LLVM 7.0.1's lib/Target/ARM/{ARMInstrThumb,ARMInstrInfo,ARMInstrThumb2}.td diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/arm_shared/mod.rs similarity index 65% rename from crates/core_arch/src/acle/mod.rs rename to crates/core_arch/src/arm_shared/mod.rs index 5f29decf5a..4c8d19854e 100644 --- a/crates/core_arch/src/acle/mod.rs +++ b/crates/core_arch/src/arm_shared/mod.rs @@ -47,6 +47,9 @@ //! //! - [ACLE Q2 2018](https://developer.arm.com/docs/101028/latest) +// Only for 'neon' submodule +#![allow(non_camel_case_types)] + // 8, 7 and 6-M are supported via dedicated instructions like DMB. All other arches are supported // via CP15 instructions. 
See Section 10.1 of ACLE mod barrier; @@ -54,70 +57,29 @@ mod barrier; pub use self::barrier::*; mod hints; - pub use self::hints::*; mod registers; - pub use self::registers::*; -mod ex; - -pub use self::ex::*; - -// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD) -// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see -// section 5.4.7) -// Here we workaround the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on -// '+v5te' rather than on '+dsp' -#[cfg(all( - not(target_arch = "aarch64"), - any( - // >= v5TE but excludes v7-M - all(target_feature = "v5te", not(target_feature = "mclass")), - // v7E-M - all(target_feature = "mclass", target_feature = "dsp"), - ) -))] -mod dsp; - -#[cfg(all( - not(target_arch = "aarch64"), - any( - all(target_feature = "v5te", not(target_feature = "mclass")), - all(target_feature = "mclass", target_feature = "dsp"), - ) -))] -pub use self::dsp::*; - -// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT) -#[cfg(all(not(target_arch = "aarch64"), target_feature = "v6",))] -mod sat; - -#[cfg(all(not(target_arch = "aarch64"), target_feature = "v6",))] -pub use self::sat::*; - -// Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says -// Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated -#[cfg(all( - not(target_arch = "aarch64"), - any( - // v7-A, v7-R - all(target_feature = "v6", not(target_feature = "mclass")), - // v7E-M - all(target_feature = "mclass", target_feature = "dsp") - ) -))] -mod simd32; - -#[cfg(all( - not(target_arch = "aarch64"), - any( - all(target_feature = "v6", not(target_feature = "mclass")), - all(target_feature = "mclass", target_feature = "dsp") - ) -))] -pub use self::simd32::*; +#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] +mod crc; +#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] +pub use crc::*; + +#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] +mod crypto; +#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] +pub use self::crypto::*; + +#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] +pub(crate) mod neon; +#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] +pub use self::neon::*; + +#[cfg(test)] +#[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] +pub(crate) mod test_support; mod sealed { pub trait Dmb { diff --git a/crates/core_arch/src/arm/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs similarity index 100% rename from crates/core_arch/src/arm/neon/generated.rs rename to crates/core_arch/src/arm_shared/neon/generated.rs diff --git a/crates/core_arch/src/arm/neon/load_tests.rs b/crates/core_arch/src/arm_shared/neon/load_tests.rs similarity index 100% rename from crates/core_arch/src/arm/neon/load_tests.rs rename to crates/core_arch/src/arm_shared/neon/load_tests.rs diff --git a/crates/core_arch/src/arm/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs similarity index 88% rename from crates/core_arch/src/arm/neon/mod.rs rename to crates/core_arch/src/arm_shared/neon/mod.rs index 02a58c1e87..3c87862166 100644 --- a/crates/core_arch/src/arm/neon/mod.rs +++ b/crates/core_arch/src/arm_shared/neon/mod.rs @@ -5,8 +5,6 @@ mod generated; #[rustfmt::skip] pub use self::generated::*; -#[cfg(target_arch = "arm")] -use crate::mem::align_of; use crate::{ convert::TryInto, core_arch::simd::*, core_arch::simd_llvm::*, hint::unreachable_unchecked, 
mem::transmute, @@ -21,67 +19,67 @@ pub(crate) type p128 = u128; types! { /// ARM-specific 64-bit wide vector of eight packed `i8`. - pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); + pub struct int8x8_t(pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8); /// ARM-specific 64-bit wide vector of eight packed `u8`. - pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + pub struct uint8x8_t(pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8); /// ARM-specific 64-bit wide polynomial vector of eight packed `p8`. - pub struct poly8x8_t(p8, p8, p8, p8, p8, p8, p8, p8); + pub struct poly8x8_t(pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8); /// ARM-specific 64-bit wide vector of four packed `i16`. - pub struct int16x4_t(i16, i16, i16, i16); + pub struct int16x4_t(pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16); /// ARM-specific 64-bit wide vector of four packed `u16`. - pub struct uint16x4_t(u16, u16, u16, u16); + pub struct uint16x4_t(pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16); // FIXME: ARM-specific 64-bit wide vector of four packed `f16`. // pub struct float16x4_t(f16, f16, f16, f16); /// ARM-specific 64-bit wide vector of four packed `p16`. - pub struct poly16x4_t(p16, p16, p16, p16); + pub struct poly16x4_t(pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16); /// ARM-specific 64-bit wide vector of two packed `i32`. - pub struct int32x2_t(i32, i32); + pub struct int32x2_t(pub(crate) i32, pub(crate) i32); /// ARM-specific 64-bit wide vector of two packed `u32`. - pub struct uint32x2_t(u32, u32); + pub struct uint32x2_t(pub(crate) u32, pub(crate) u32); /// ARM-specific 64-bit wide vector of two packed `f32`. - pub struct float32x2_t(f32, f32); + pub struct float32x2_t(pub(crate) f32, pub(crate) f32); /// ARM-specific 64-bit wide vector of one packed `i64`. - pub struct int64x1_t(i64); + pub struct int64x1_t(pub(crate) i64); /// ARM-specific 64-bit wide vector of one packed `u64`. - pub struct uint64x1_t(u64); + pub struct uint64x1_t(pub(crate) u64); /// ARM-specific 64-bit wide vector of one packed `p64`. - pub struct poly64x1_t(p64); + pub struct poly64x1_t(pub(crate) p64); /// ARM-specific 128-bit wide vector of sixteen packed `i8`. pub struct int8x16_t( - i8, i8, i8, i8, i8, i8 ,i8, i8, - i8, i8, i8, i8, i8, i8 ,i8, i8, + pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8 , pub(crate) i8, pub(crate) i8, + pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8, pub(crate) i8 , pub(crate) i8, pub(crate) i8, ); /// ARM-specific 128-bit wide vector of sixteen packed `u8`. pub struct uint8x16_t( - u8, u8 ,u8, u8, u8, u8 ,u8, u8, - u8, u8 ,u8, u8, u8, u8 ,u8, u8, + pub(crate) u8, pub(crate) u8 , pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8 , pub(crate) u8, pub(crate) u8, + pub(crate) u8, pub(crate) u8 , pub(crate) u8, pub(crate) u8, pub(crate) u8, pub(crate) u8 , pub(crate) u8, pub(crate) u8, ); /// ARM-specific 128-bit wide vector of sixteen packed `p8`. 
pub struct poly8x16_t( - p8, p8, p8, p8, p8, p8, p8, p8, - p8, p8, p8, p8, p8, p8, p8, p8, + pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, + pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, pub(crate) p8, ); /// ARM-specific 128-bit wide vector of eight packed `i16`. - pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); + pub struct int16x8_t(pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16, pub(crate) i16); /// ARM-specific 128-bit wide vector of eight packed `u16`. - pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + pub struct uint16x8_t(pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16, pub(crate) u16); // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`. // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16); /// ARM-specific 128-bit wide vector of eight packed `p16`. - pub struct poly16x8_t(p16, p16, p16, p16, p16, p16, p16, p16); + pub struct poly16x8_t(pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16, pub(crate) p16); /// ARM-specific 128-bit wide vector of four packed `i32`. - pub struct int32x4_t(i32, i32, i32, i32); + pub struct int32x4_t(pub(crate) i32, pub(crate) i32, pub(crate) i32, pub(crate) i32); /// ARM-specific 128-bit wide vector of four packed `u32`. - pub struct uint32x4_t(u32, u32, u32, u32); + pub struct uint32x4_t(pub(crate) u32, pub(crate) u32, pub(crate) u32, pub(crate) u32); /// ARM-specific 128-bit wide vector of four packed `f32`. - pub struct float32x4_t(f32, f32, f32, f32); + pub struct float32x4_t(pub(crate) f32, pub(crate) f32, pub(crate) f32, pub(crate) f32); /// ARM-specific 128-bit wide vector of two packed `i64`. - pub struct int64x2_t(i64, i64); + pub struct int64x2_t(pub(crate) i64, pub(crate) i64); /// ARM-specific 128-bit wide vector of two packed `u64`. - pub struct uint64x2_t(u64, u64); + pub struct uint64x2_t(pub(crate) u64, pub(crate) u64); /// ARM-specific 128-bit wide vector of two packed `p64`. - pub struct poly64x2_t(p64, p64); + pub struct poly64x2_t(pub(crate) p64, pub(crate) p64); } /// ARM-specific type containing two `int8x8_t` vectors. 
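// Illustrative sketch, not part of this patch: the pub(crate) fields added in the
// hunk above let sibling modules inside core_arch build these vectors by tuple
// construction (e.g. int32x2_t(n, n)), while code outside the crate keeps going
// through the intrinsics, as below. Assumes an AArch64 target, where NEON is baseline.
#[cfg(target_arch = "aarch64")]
fn splat_seven() -> core::arch::aarch64::int32x2_t {
    // vdup_n_s32 broadcasts the scalar into both 32-bit lanes; NEON is always
    // available on AArch64, so the call is sound here.
    unsafe { core::arch::aarch64::vdup_n_s32(7) }
}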
@@ -214,74 +212,74 @@ extern "C" { target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlp.v4i16.v8i8" )] - fn vpaddl_s8_(a: int8x8_t) -> int16x4_t; + pub(crate) fn vpaddl_s8_(a: int8x8_t) -> int16x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i32.v4i16")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlp.v2i32.v4i16" )] - fn vpaddl_s16_(a: int16x4_t) -> int32x2_t; + pub(crate) fn vpaddl_s16_(a: int16x4_t) -> int32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v1i64.v2i32")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlp.v1i64.v2i32" )] - fn vpaddl_s32_(a: int32x2_t) -> int64x1_t; + pub(crate) fn vpaddl_s32_(a: int32x2_t) -> int64x1_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v8i16.v16i8")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlp.v8i16.v16i8" )] - fn vpaddlq_s8_(a: int8x16_t) -> int16x8_t; + pub(crate) fn vpaddlq_s8_(a: int8x16_t) -> int16x8_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v4i32.v8i16")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlp.v4i32.v8i16" )] - fn vpaddlq_s16_(a: int16x8_t) -> int32x4_t; + pub(crate) fn vpaddlq_s16_(a: int16x8_t) -> int32x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddls.v2i64.v4i32")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.saddlp.v2i64.v4i32" )] - fn vpaddlq_s32_(a: int32x4_t) -> int64x2_t; + pub(crate) fn vpaddlq_s32_(a: int32x4_t) -> int64x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i16.v8i8")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlp.v4i16.v8i8" )] - fn vpaddl_u8_(a: uint8x8_t) -> uint16x4_t; + pub(crate) fn vpaddl_u8_(a: uint8x8_t) -> uint16x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i32.v4i16")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlp.v2i32.v4i16" )] - fn vpaddl_u16_(a: uint16x4_t) -> uint32x2_t; + pub(crate) fn vpaddl_u16_(a: uint16x4_t) -> uint32x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v1i64.v2i32")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlp.v1i64.v2i32" )] - fn vpaddl_u32_(a: uint32x2_t) -> uint64x1_t; + pub(crate) fn vpaddl_u32_(a: uint32x2_t) -> uint64x1_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v8i16.v16i8")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlp.v8i16.v16i8" )] - fn vpaddlq_u8_(a: uint8x16_t) -> uint16x8_t; + pub(crate) fn vpaddlq_u8_(a: uint8x16_t) -> uint16x8_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v4i32.v8i16")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlp.v4i32.v8i16" )] - fn vpaddlq_u16_(a: uint16x8_t) -> uint32x4_t; + pub(crate) fn vpaddlq_u16_(a: uint16x8_t) -> uint32x4_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vpaddlu.v2i64.v4i32")] #[cfg_attr( target_arch = "aarch64", link_name = "llvm.aarch64.neon.uaddlp.v2i64.v4i32" )] - fn vpaddlq_u32_(a: uint32x4_t) -> uint64x2_t; + pub(crate) fn vpaddlq_u32_(a: uint32x4_t) -> uint64x2_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.ctpop.v8i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.ctpop.v8i8")] @@ -310,301 +308,6 @@ extern "C" { fn vclzq_s32_(a: int32x4_t) -> int32x4_t; } -#[cfg(target_arch = "arm")] -#[allow(improper_ctypes)] -extern "C" { - #[link_name = 
"llvm.arm.neon.vbsl.v8i8"] - fn vbsl_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vbsl.v16i8"] - fn vbslq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; - #[link_name = "llvm.arm.neon.vpadals.v4i16.v8i8"] - fn vpadal_s8_(a: int16x4_t, b: int8x8_t) -> int16x4_t; - #[link_name = "llvm.arm.neon.vpadals.v2i32.v4i16"] - fn vpadal_s16_(a: int32x2_t, b: int16x4_t) -> int32x2_t; - #[link_name = "llvm.arm.neon.vpadals.v1i64.v2i32"] - fn vpadal_s32_(a: int64x1_t, b: int32x2_t) -> int64x1_t; - #[link_name = "llvm.arm.neon.vpadals.v8i16.v16i8"] - fn vpadalq_s8_(a: int16x8_t, b: int8x16_t) -> int16x8_t; - #[link_name = "llvm.arm.neon.vpadals.v4i32.v8i16"] - fn vpadalq_s16_(a: int32x4_t, b: int16x8_t) -> int32x4_t; - #[link_name = "llvm.arm.neon.vpadals.v2i64.v4i32"] - fn vpadalq_s32_(a: int64x2_t, b: int32x4_t) -> int64x2_t; - - #[link_name = "llvm.arm.neon.vpadalu.v4i16.v8i8"] - fn vpadal_u8_(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t; - #[link_name = "llvm.arm.neon.vpadalu.v2i32.v4i16"] - fn vpadal_u16_(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t; - #[link_name = "llvm.arm.neon.vpadalu.v1i64.v2i32"] - fn vpadal_u32_(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t; - #[link_name = "llvm.arm.neon.vpadalu.v8i16.v16i8"] - fn vpadalq_u8_(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t; - #[link_name = "llvm.arm.neon.vpadalu.v4i32.v8i16"] - fn vpadalq_u16_(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t; - #[link_name = "llvm.arm.neon.vpadalu.v2i64.v4i32"] - fn vpadalq_u32_(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t; - - #[link_name = "llvm.arm.neon.vtbl1"] - fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vtbl2"] - fn vtbl2(a: int8x8_t, b: int8x8_t, b: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vtbl3"] - fn vtbl3(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vtbl4"] - fn vtbl4(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t; - - #[link_name = "llvm.arm.neon.vtbx1"] - fn vtbx1(a: int8x8_t, b: int8x8_t, b: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vtbx2"] - fn vtbx2(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vtbx3"] - fn vtbx3(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vtbx4"] - fn vtbx4( - a: int8x8_t, - b: int8x8_t, - b: int8x8_t, - c: int8x8_t, - d: int8x8_t, - e: int8x8_t, - ) -> int8x8_t; - - #[link_name = "llvm.arm.neon.vshiftins.v8i8"] - fn vshiftins_v8i8(a: int8x8_t, b: int8x8_t, shift: int8x8_t) -> int8x8_t; - #[link_name = "llvm.arm.neon.vshiftins.v16i8"] - fn vshiftins_v16i8(a: int8x16_t, b: int8x16_t, shift: int8x16_t) -> int8x16_t; - #[link_name = "llvm.arm.neon.vshiftins.v4i16"] - fn vshiftins_v4i16(a: int16x4_t, b: int16x4_t, shift: int16x4_t) -> int16x4_t; - #[link_name = "llvm.arm.neon.vshiftins.v8i16"] - fn vshiftins_v8i16(a: int16x8_t, b: int16x8_t, shift: int16x8_t) -> int16x8_t; - #[link_name = "llvm.arm.neon.vshiftins.v2i32"] - fn vshiftins_v2i32(a: int32x2_t, b: int32x2_t, shift: int32x2_t) -> int32x2_t; - #[link_name = "llvm.arm.neon.vshiftins.v4i32"] - fn vshiftins_v4i32(a: int32x4_t, b: int32x4_t, shift: int32x4_t) -> int32x4_t; - #[link_name = "llvm.arm.neon.vshiftins.v1i64"] - fn vshiftins_v1i64(a: int64x1_t, b: int64x1_t, shift: int64x1_t) -> int64x1_t; - #[link_name = "llvm.arm.neon.vshiftins.v2i64"] - fn vshiftins_v2i64(a: int64x2_t, b: int64x2_t, shift: int64x2_t) -> int64x2_t; - - 
#[link_name = "llvm.arm.neon.vld1.v8i8.p0i8"] - fn vld1_v8i8(addr: *const i8, align: i32) -> int8x8_t; - #[link_name = "llvm.arm.neon.vld1.v16i8.p0i8"] - fn vld1q_v16i8(addr: *const i8, align: i32) -> int8x16_t; - #[link_name = "llvm.arm.neon.vld1.v4i16.p0i8"] - fn vld1_v4i16(addr: *const i8, align: i32) -> int16x4_t; - #[link_name = "llvm.arm.neon.vld1.v8i16.p0i8"] - fn vld1q_v8i16(addr: *const i8, align: i32) -> int16x8_t; - #[link_name = "llvm.arm.neon.vld1.v2i32.p0i8"] - fn vld1_v2i32(addr: *const i8, align: i32) -> int32x2_t; - #[link_name = "llvm.arm.neon.vld1.v4i32.p0i8"] - fn vld1q_v4i32(addr: *const i8, align: i32) -> int32x4_t; - #[link_name = "llvm.arm.neon.vld1.v1i64.p0i8"] - fn vld1_v1i64(addr: *const i8, align: i32) -> int64x1_t; - #[link_name = "llvm.arm.neon.vld1.v2i64.p0i8"] - fn vld1q_v2i64(addr: *const i8, align: i32) -> int64x2_t; - #[link_name = "llvm.arm.neon.vld1.v2f32.p0i8"] - fn vld1_v2f32(addr: *const i8, align: i32) -> float32x2_t; - #[link_name = "llvm.arm.neon.vld1.v4f32.p0i8"] - fn vld1q_v4f32(addr: *const i8, align: i32) -> float32x4_t; -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.8"))] -pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t { - vld1_v8i8(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.8"))] -pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t { - vld1q_v16i8(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.16"))] -pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t { - vld1_v4i16(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.16"))] -pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t { - vld1q_v8i16(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vldr))] -pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t { - vld1_v2i32(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.32"))] -pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t { - vld1q_v4i32(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vldr))] -pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t { - vld1_v1i64(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. 
-#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.64"))] -pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t { - vld1q_v2i64(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.8"))] -pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t { - transmute(vld1_v8i8(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.8"))] -pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t { - transmute(vld1q_v16i8(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.16"))] -pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t { - transmute(vld1_v4i16(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.16"))] -pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t { - transmute(vld1q_v8i16(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vldr))] -pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t { - transmute(vld1_v2i32(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.32"))] -pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t { - transmute(vld1q_v4i32(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vldr))] -pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t { - transmute(vld1_v1i64(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.64"))] -pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t { - transmute(vld1q_v2i64(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.8"))] -pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t { - transmute(vld1_v8i8(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. 
-#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.8"))] -pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t { - transmute(vld1q_v16i8(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.16"))] -pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t { - transmute(vld1_v4i16(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.16"))] -pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { - transmute(vld1q_v8i16(ptr as *const i8, align_of::() as i32)) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vldr))] -pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t { - vld1_v2f32(ptr as *const i8, align_of::() as i32) -} - -/// Load multiple single-element structures to one, two, three, or four registers. -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vld1.32"))] -pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t { - vld1q_v4f32(ptr as *const i8, align_of::() as i32) -} - /// Load one single-element structure to one lane of one register. #[inline] #[target_feature(enable = "neon")] @@ -943,8 +646,13 @@ pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t { #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))] pub unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t { #[cfg(target_arch = "aarch64")] - use crate::core_arch::aarch64::vld1_s64; - vld1_s64(ptr) + { + crate::core_arch::aarch64::vld1_s64(ptr) + } + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::vld1_s64(ptr) + } } /// Load one single-element structure and Replicate to all lanes (of one register). @@ -1032,8 +740,13 @@ pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t { #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))] pub unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t { #[cfg(target_arch = "aarch64")] - use crate::core_arch::aarch64::vld1_u64; - vld1_u64(ptr) + { + crate::core_arch::aarch64::vld1_u64(ptr) + } + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::vld1_u64(ptr) + } } /// Load one single-element structure and Replicate to all lanes (of one register). @@ -2158,321 +1871,117 @@ pub unsafe fn vpaddlq_u32(a: uint32x4_t) -> uint64x2_t { vpaddlq_u32_(a) } -/// Signed Add and Accumulate Long Pairwise. +/// Vector narrow integer. 
#[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] -pub unsafe fn vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t { - #[cfg(target_arch = "arm")] - { - vpadal_s8_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddl_s8_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s16(a: int16x8_t) -> int8x8_t { + simd_cast(a) } -/// Signed Add and Accumulate Long Pairwise. +/// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] -pub unsafe fn vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t { - #[cfg(target_arch = "arm")] - { - vpadal_s16_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddl_s16_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s32(a: int32x4_t) -> int16x4_t { + simd_cast(a) } -/// Signed Add and Accumulate Long Pairwise. +/// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] -pub unsafe fn vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t { - #[cfg(target_arch = "arm")] - { - vpadal_s32_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddl_s32_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s64(a: int64x2_t) -> int32x2_t { + simd_cast(a) } -/// Signed Add and Accumulate Long Pairwise. +/// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] -pub unsafe fn vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { - #[cfg(target_arch = "arm")] - { - vpadalq_s8_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddlq_s8_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u16(a: uint16x8_t) -> uint8x8_t { + simd_cast(a) } -/// Signed Add and Accumulate Long Pairwise. +/// Vector narrow integer. 
#[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] -pub unsafe fn vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { - #[cfg(target_arch = "arm")] - { - vpadalq_s16_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddlq_s16_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u32(a: uint32x4_t) -> uint16x4_t { + simd_cast(a) } -/// Signed Add and Accumulate Long Pairwise. +/// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] -pub unsafe fn vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { - #[cfg(target_arch = "arm")] - { - vpadalq_s32_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddlq_s32_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u64(a: uint64x2_t) -> uint32x2_t { + simd_cast(a) } -/// Unsigned Add and Accumulate Long Pairwise. +/// Vector long move. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] -pub unsafe fn vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t { - #[cfg(target_arch = "arm")] - { - vpadal_u8_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddl_u8_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] +pub unsafe fn vmovl_s8(a: int8x8_t) -> int16x8_t { + simd_cast(a) } -/// Unsigned Add and Accumulate Long Pairwise. +/// Vector long move. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] -pub unsafe fn vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t { - #[cfg(target_arch = "arm")] - { - vpadal_u16_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddl_u16_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] +pub unsafe fn vmovl_s16(a: int16x4_t) -> int32x4_t { + simd_cast(a) } -/// Unsigned Add and Accumulate Long Pairwise. +/// Vector long move. 
#[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] -pub unsafe fn vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t { - #[cfg(target_arch = "arm")] - { - vpadal_u32_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddl_u32_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] +pub unsafe fn vmovl_s32(a: int32x2_t) -> int64x2_t { + simd_cast(a) } -/// Unsigned Add and Accumulate Long Pairwise. +/// Vector long move. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] -pub unsafe fn vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { - #[cfg(target_arch = "arm")] - { - vpadalq_u8_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddlq_u8_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] +pub unsafe fn vmovl_u8(a: uint8x8_t) -> uint16x8_t { + simd_cast(a) } -/// Unsigned Add and Accumulate Long Pairwise. +/// Vector long move. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] -pub unsafe fn vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { - #[cfg(target_arch = "arm")] - { - vpadalq_u16_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddlq_u16_(b), a) - } +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] +pub unsafe fn vmovl_u16(a: uint16x4_t) -> uint32x4_t { + simd_cast(a) } -/// Unsigned Add and Accumulate Long Pairwise. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] -pub unsafe fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { - #[cfg(target_arch = "arm")] - { - vpadalq_u32_(a, b) - } - #[cfg(target_arch = "aarch64")] - { - simd_add(vpaddlq_u32_(b), a) - } -} - -/// Vector narrow integer. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] -pub unsafe fn vmovn_s16(a: int16x8_t) -> int8x8_t { - simd_cast(a) -} - -/// Vector narrow integer. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] -pub unsafe fn vmovn_s32(a: int32x4_t) -> int16x4_t { - simd_cast(a) -} - -/// Vector narrow integer. 
-#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] -pub unsafe fn vmovn_s64(a: int64x2_t) -> int32x2_t { - simd_cast(a) -} - -/// Vector narrow integer. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] -pub unsafe fn vmovn_u16(a: uint16x8_t) -> uint8x8_t { - simd_cast(a) -} - -/// Vector narrow integer. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] -pub unsafe fn vmovn_u32(a: uint32x4_t) -> uint16x4_t { - simd_cast(a) -} - -/// Vector narrow integer. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] -pub unsafe fn vmovn_u64(a: uint64x2_t) -> uint32x2_t { - simd_cast(a) -} - -/// Vector long move. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s8(a: int8x8_t) -> int16x8_t { - simd_cast(a) -} - -/// Vector long move. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s16(a: int16x4_t) -> int32x4_t { - simd_cast(a) -} - -/// Vector long move. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s32(a: int32x2_t) -> int64x2_t { - simd_cast(a) -} - -/// Vector long move. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u8(a: uint8x8_t) -> uint16x8_t { - simd_cast(a) -} - -/// Vector long move. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u16(a: uint16x4_t) -> uint32x4_t { - simd_cast(a) -} - -/// Vector long move. +/// Vector long move. 
#[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] @@ -3368,304 +2877,6 @@ pub unsafe fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { vpmaxf_v2f32(a, b) } -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - vtbl1(a, b) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { - transmute(vtbl1(transmute(a), transmute(b))) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t { - transmute(vtbl1(transmute(a), transmute(b))) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t { - vtbl2(a.0, a.1, b) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t { - transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t { - transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t { - vtbl3(a.0, a.1, a.2, b) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t { - transmute(vtbl3( - transmute(a.0), - transmute(a.1), - transmute(a.2), - transmute(b), - )) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t { - transmute(vtbl3( - transmute(a.0), - transmute(a.1), - transmute(a.2), - transmute(b), - )) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t { - vtbl4(a.0, a.1, a.2, a.3, b) -} - -/// Table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t { - transmute(vtbl4( - transmute(a.0), - transmute(a.1), - transmute(a.2), - transmute(a.3), - transmute(b), - )) -} - -/// Table look-up -#[inline] 
-#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbl))] -pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { - transmute(vtbl4( - transmute(a.0), - transmute(a.1), - transmute(a.2), - transmute(a.3), - transmute(b), - )) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { - vtbx1(a, b, c) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { - transmute(vtbx1(transmute(a), transmute(b), transmute(c))) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t { - transmute(vtbx1(transmute(a), transmute(b), transmute(c))) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t { - vtbx2(a, b.0, b.1, c) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t { - transmute(vtbx2( - transmute(a), - transmute(b.0), - transmute(b.1), - transmute(c), - )) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t { - transmute(vtbx2( - transmute(a), - transmute(b.0), - transmute(b.1), - transmute(c), - )) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t { - vtbx3(a, b.0, b.1, b.2, c) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { - transmute(vtbx3( - transmute(a), - transmute(b.0), - transmute(b.1), - transmute(b.2), - transmute(c), - )) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { - transmute(vtbx3( - transmute(a), - transmute(b.0), - transmute(b.1), - transmute(b.2), - transmute(c), - )) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: 
int8x8_t) -> int8x8_t { - vtbx4(a, b.0, b.1, b.2, b.3, c) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t { - transmute(vtbx4( - transmute(a), - transmute(b.0), - transmute(b.1), - transmute(b.2), - transmute(b.3), - transmute(c), - )) -} - -/// Extended table look-up -#[inline] -#[cfg(target_arch = "arm")] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr(vtbx))] -pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t { - transmute(vtbx4( - transmute(a), - transmute(b.0), - transmute(b.1), - transmute(b.2), - transmute(b.3), - transmute(c), - )) -} - /// Move vector element to general-purpose register #[inline] #[target_feature(enable = "neon")] @@ -4440,29 +3651,6 @@ pub unsafe fn vext_u64(a: uint64x1_t, _b: uint64x1_t) -> uint64x1_ a } -// These float-to-int implementations have undefined behaviour when `a` overflows -// the destination type. Clang has the same problem: https://llvm.org/PR47510 - -/// Floating-point Convert to Signed fixed-point, rounding toward Zero (vector) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon")] -#[target_feature(enable = "v7")] -#[cfg_attr(test, assert_instr("vcvt.s32.f32"))] -pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t { - transmute(simd_cast::<_, i32x4>(transmute::<_, f32x4>(a))) -} - -/// Floating-point Convert to Unsigned fixed-point, rounding toward Zero (vector) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon")] -#[target_feature(enable = "v7")] -#[cfg_attr(test, assert_instr("vcvt.u32.f32"))] -pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t { - transmute(simd_cast::<_, u32x4>(transmute::<_, f32x4>(a))) -} - /// Population count per byte. 
#[inline] #[target_feature(enable = "neon")] @@ -4518,532 +3706,6 @@ pub unsafe fn vcntq_p8(a: poly8x16_t) -> poly8x16_t { transmute(vcntq_s8_(transmute(a))) } -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - static_assert_imm3!(N); - let n = N as i8; - vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n)) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - static_assert_imm3!(N); - let n = N as i8; - vshiftins_v16i8( - a, - b, - int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), - ) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - static_assert_imm4!(N); - let n = N as i16; - vshiftins_v4i16(a, b, int16x4_t(n, n, n, n)) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - static_assert_imm4!(N); - let n = N as i16; - vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n)) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - static_assert!(N: i32 where N >= 0 && N <= 31); - vshiftins_v2i32(a, b, int32x2_t(N, N)) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - static_assert!(N: i32 where N >= 0 && N <= 31); - vshiftins_v4i32(a, b, int32x4_t(N, N, N, N)) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { - static_assert!(N : i32 where 0 <= N && N <= 63); - vshiftins_v1i64(a, b, int64x1_t(N as i64)) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { - static_assert!(N : i32 where 0 <= N && N <= 63); - vshiftins_v2i64(a, b, int64x2_t(N as i64, N as i64)) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { - static_assert_imm3!(N); - let n = N as i8; - transmute(vshiftins_v8i8( - transmute(a), 
- transmute(b), - int8x8_t(n, n, n, n, n, n, n, n), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - static_assert_imm3!(N); - let n = N as i8; - transmute(vshiftins_v16i8( - transmute(a), - transmute(b), - int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { - static_assert_imm4!(N); - let n = N as i16; - transmute(vshiftins_v4i16( - transmute(a), - transmute(b), - int16x4_t(n, n, n, n), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { - static_assert_imm4!(N); - let n = N as i16; - transmute(vshiftins_v8i16( - transmute(a), - transmute(b), - int16x8_t(n, n, n, n, n, n, n, n), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { - static_assert!(N: i32 where N >= 0 && N <= 31); - transmute(vshiftins_v2i32(transmute(a), transmute(b), int32x2_t(N, N))) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { - static_assert!(N: i32 where N >= 0 && N <= 31); - transmute(vshiftins_v4i32( - transmute(a), - transmute(b), - int32x4_t(N, N, N, N), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { - static_assert!(N : i32 where 0 <= N && N <= 63); - transmute(vshiftins_v1i64( - transmute(a), - transmute(b), - int64x1_t(N as i64), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - static_assert!(N : i32 where 0 <= N && N <= 63); - transmute(vshiftins_v2i64( - transmute(a), - transmute(b), - int64x2_t(N as i64, N as i64), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { - static_assert_imm3!(N); - let n = N as i8; - transmute(vshiftins_v8i8( - transmute(a), - transmute(b), - int8x8_t(n, n, n, n, n, n, n, n), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = 
"neon,v7")] -#[cfg_attr(test, assert_instr("vsli.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { - static_assert_imm3!(N); - let n = N as i8; - transmute(vshiftins_v16i8( - transmute(a), - transmute(b), - int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsli_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { - static_assert_imm4!(N); - let n = N as i16; - transmute(vshiftins_v4i16( - transmute(a), - transmute(b), - int16x4_t(n, n, n, n), - )) -} -/// Shift Left and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsli.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsliq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { - static_assert_imm4!(N); - let n = N as i16; - transmute(vshiftins_v8i16( - transmute(a), - transmute(b), - int16x8_t(n, n, n, n, n, n, n, n), - )) -} - -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - static_assert!(N : i32 where 1 <= N && N <= 8); - let n = -N as i8; - vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n)) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - static_assert!(N : i32 where 1 <= N && N <= 8); - let n = -N as i8; - vshiftins_v16i8( - a, - b, - int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), - ) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - static_assert!(N : i32 where 1 <= N && N <= 16); - let n = -N as i16; - vshiftins_v4i16(a, b, int16x4_t(n, n, n, n)) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - static_assert!(N : i32 where 1 <= N && N <= 16); - let n = -N as i16; - vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n)) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - static_assert!(N : i32 where 1 <= N && N <= 32); - vshiftins_v2i32(a, b, int32x2_t(-N, -N)) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - static_assert!(N : i32 where 1 <= N && N <= 32); - vshiftins_v4i32(a, 
b, int32x4_t(-N, -N, -N, -N)) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { - static_assert!(N : i32 where 1 <= N && N <= 64); - vshiftins_v1i64(a, b, int64x1_t(-N as i64)) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { - static_assert!(N : i32 where 1 <= N && N <= 64); - vshiftins_v2i64(a, b, int64x2_t(-N as i64, -N as i64)) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { - static_assert!(N : i32 where 1 <= N && N <= 8); - let n = -N as i8; - transmute(vshiftins_v8i8( - transmute(a), - transmute(b), - int8x8_t(n, n, n, n, n, n, n, n), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - static_assert!(N : i32 where 1 <= N && N <= 8); - let n = -N as i8; - transmute(vshiftins_v16i8( - transmute(a), - transmute(b), - int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { - static_assert!(N : i32 where 1 <= N && N <= 16); - let n = -N as i16; - transmute(vshiftins_v4i16( - transmute(a), - transmute(b), - int16x4_t(n, n, n, n), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { - static_assert!(N : i32 where 1 <= N && N <= 16); - let n = -N as i16; - transmute(vshiftins_v8i16( - transmute(a), - transmute(b), - int16x8_t(n, n, n, n, n, n, n, n), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { - static_assert!(N : i32 where 1 <= N && N <= 32); - transmute(vshiftins_v2i32( - transmute(a), - transmute(b), - int32x2_t(-N, -N), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.32", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { - static_assert!(N : i32 where 1 <= N && N <= 32); - transmute(vshiftins_v4i32( - transmute(a), - transmute(b), - int32x4_t(-N, -N, -N, -N), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] 
-#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { - static_assert!(N : i32 where 1 <= N && N <= 64); - transmute(vshiftins_v1i64( - transmute(a), - transmute(b), - int64x1_t(-N as i64), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.64", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - static_assert!(N : i32 where 1 <= N && N <= 64); - transmute(vshiftins_v2i64( - transmute(a), - transmute(b), - int64x2_t(-N as i64, -N as i64), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { - static_assert!(N : i32 where 1 <= N && N <= 8); - let n = -N as i8; - transmute(vshiftins_v8i8( - transmute(a), - transmute(b), - int8x8_t(n, n, n, n, n, n, n, n), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.8", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { - static_assert!(N : i32 where 1 <= N && N <= 8); - let n = -N as i8; - transmute(vshiftins_v16i8( - transmute(a), - transmute(b), - int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsri_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { - static_assert!(N : i32 where 1 <= N && N <= 16); - let n = -N as i16; - transmute(vshiftins_v4i16( - transmute(a), - transmute(b), - int16x4_t(n, n, n, n), - )) -} -/// Shift Right and Insert (immediate) -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(test, assert_instr("vsri.16", N = 1))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vsriq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { - static_assert!(N : i32 where 1 <= N && N <= 16); - let n = -N as i16; - transmute(vshiftins_v8i16( - transmute(a), - transmute(b), - int16x8_t(n, n, n, n, n, n, n, n), - )) -} - /// Reversing vector elements (swap endianness) #[inline] #[target_feature(enable = "neon")] @@ -5404,11 +4066,219 @@ pub unsafe fn vrev64q_p16(a: poly16x8_t) -> poly16x8_t { simd_shuffle8(a, a, [3, 2, 1, 0, 7, 6, 5, 4]) } +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadal_s8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_s8_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadal_s16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_s16_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadal_s32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_s32_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadalq_s8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_s8_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadalq_s16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_s16_(b), a) + } +} + +/// Signed Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.s32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sadalp))] +pub unsafe fn vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadalq_s32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_s32_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadal_u8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_u8_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadal_u16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_u16_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadal_u32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddl_u32_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u8))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadalq_u8_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_u8_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u16))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadalq_u16_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_u16_(b), a) + } +} + +/// Unsigned Add and Accumulate Long Pairwise. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vpadal.u32))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uadalp))] +pub unsafe fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::neon::vpadalq_u32_(a, b) + } + #[cfg(target_arch = "aarch64")] + { + simd_add(vpaddlq_u32_(b), a) + } +} + #[cfg(test)] mod tests { use super::*; - use crate::core_arch::arm::test_support::*; - use crate::core_arch::{arm::*, simd::*}; + #[cfg(target_arch = "aarch64")] + use crate::core_arch::aarch64::*; + #[cfg(target_arch = "arm")] + use crate::core_arch::arm::*; + use crate::core_arch::arm_shared::test_support::*; + use crate::core_arch::simd::*; use std::{i16, i32, i8, mem::transmute, u16, u32, u8, vec::Vec}; use stdarch_test::simd_test; @@ -5792,24 +4662,6 @@ mod tests { assert_eq!(r, e) } - #[cfg(target_arch = "arm")] - #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_s32_f32() { - let f = f32x4::new(-1., 2., 3., 4.); - let e = i32x4::new(-1, 2, 3, 4); - let r: i32x4 = transmute(vcvtq_s32_f32(transmute(f))); - assert_eq!(r, e); - } - - #[cfg(target_arch = "arm")] - #[simd_test(enable = "neon")] - unsafe fn test_vcvtq_u32_f32() { - let f = f32x4::new(1., 2., 3., 4.); - let e = u32x4::new(1, 2, 3, 4); - let r: u32x4 = transmute(vcvtq_u32_f32(transmute(f))); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vget_lane_u8() { let v = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); diff --git a/crates/core_arch/src/arm/neon/shift_and_insert_tests.rs b/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs similarity index 100% rename from crates/core_arch/src/arm/neon/shift_and_insert_tests.rs rename to crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs diff --git a/crates/core_arch/src/arm/neon/table_lookup_tests.rs b/crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs similarity index 100% rename from crates/core_arch/src/arm/neon/table_lookup_tests.rs rename to crates/core_arch/src/arm_shared/neon/table_lookup_tests.rs diff --git a/crates/core_arch/src/acle/registers/aarch32.rs b/crates/core_arch/src/arm_shared/registers/aarch32.rs similarity index 100% rename from crates/core_arch/src/acle/registers/aarch32.rs rename to crates/core_arch/src/arm_shared/registers/aarch32.rs diff --git a/crates/core_arch/src/acle/registers/mod.rs b/crates/core_arch/src/arm_shared/registers/mod.rs similarity index 100% rename from crates/core_arch/src/acle/registers/mod.rs rename to crates/core_arch/src/arm_shared/registers/mod.rs diff --git a/crates/core_arch/src/acle/registers/v6m.rs b/crates/core_arch/src/arm_shared/registers/v6m.rs similarity index 100% rename from crates/core_arch/src/acle/registers/v6m.rs rename to crates/core_arch/src/arm_shared/registers/v6m.rs diff --git a/crates/core_arch/src/acle/registers/v7m.rs b/crates/core_arch/src/arm_shared/registers/v7m.rs similarity index 100% rename from crates/core_arch/src/acle/registers/v7m.rs rename to crates/core_arch/src/arm_shared/registers/v7m.rs diff --git a/crates/core_arch/src/arm/test_support.rs b/crates/core_arch/src/arm_shared/test_support.rs similarity index 98% rename from crates/core_arch/src/arm/test_support.rs rename to crates/core_arch/src/arm_shared/test_support.rs index 337a270e40..ff752f25b3 100644 --- a/crates/core_arch/src/arm/test_support.rs +++ b/crates/core_arch/src/arm_shared/test_support.rs @@ -1,4 +1,10 @@ 
-use crate::core_arch::{arm::*, simd::*}; +#[cfg(target_arch = "arm")] +use crate::core_arch::arm::*; + +#[cfg(target_arch = "aarch64")] +use crate::core_arch::aarch64::*; + +use crate::core_arch::simd::*; use std::{i16, i32, i8, mem::transmute, u16, u32, u8, vec::Vec}; macro_rules! V_u8 { diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs index 5b25687c3d..4e6dcb7dc4 100644 --- a/crates/core_arch/src/mod.rs +++ b/crates/core_arch/src/mod.rs @@ -5,7 +5,7 @@ mod macros; #[cfg(any(target_arch = "arm", target_arch = "aarch64", doc))] -mod acle; +mod arm_shared; mod simd; @@ -53,7 +53,7 @@ pub mod arch { #[doc(cfg(target_arch = "aarch64"))] #[unstable(feature = "stdsimd", issue = "27731")] pub mod aarch64 { - pub use crate::core_arch::{aarch64::*, arm::*}; + pub use crate::core_arch::aarch64::*; } /// Platform-specific intrinsics for the `wasm32` platform. @@ -234,8 +234,8 @@ mod x86_64; #[cfg(any(target_arch = "aarch64", doc))] #[doc(cfg(target_arch = "aarch64"))] mod aarch64; -#[cfg(any(target_arch = "arm", target_arch = "aarch64", doc))] -#[doc(cfg(any(target_arch = "arm", target_arch = "aarch64")))] +#[cfg(any(target_arch = "arm", doc))] +#[doc(cfg(any(target_arch = "arm")))] mod arm; #[cfg(any(target_arch = "wasm32", doc))] diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index 155c898a41..1babd33744 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -2253,7 +2253,7 @@ mod test { let arm_out_path: PathBuf = PathBuf::from(env::var("OUT_DIR").unwrap()) .join("src") - .join("arm") + .join("arm_shared") .join("neon"); std::fs::create_dir_all(&arm_out_path)?; diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index 70797e17c8..408d7190e9 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -121,7 +121,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { // in some cases exceed the limit. "cvtpi2ps" => 25, - // core_arch/src/acle/simd32 + // core_arch/src/arm_shared/simd32 "usad8" => 27, "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index 7c947d8ac7..e85f0489a8 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -224,6 +224,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "int64x1_t" => quote! { &I64X1 }, "int64x2_t" => quote! { &I64X2 }, "uint8x8_t" => quote! { &U8X8 }, + "uint8x4_t" => quote! { &U8X4 }, "uint8x8x2_t" => quote! { &U8X8X2 }, "uint8x16x2_t" => quote! { &U8X16X2 }, "uint8x16x3_t" => quote! 
{ &U8X16X3 }, diff --git a/crates/stdarch-verify/tests/arm.rs b/crates/stdarch-verify/tests/arm.rs index 9f0d981485..8c2cf47855 100644 --- a/crates/stdarch-verify/tests/arm.rs +++ b/crates/stdarch-verify/tests/arm.rs @@ -149,6 +149,7 @@ static U8X16X2: Type = Type::U(8, 16, 2); static U8X16X3: Type = Type::U(8, 16, 3); static U8X16X4: Type = Type::U(8, 16, 4); static U8X8: Type = Type::U(8, 8, 1); +static U8X4: Type = Type::U(8, 4, 1); static U8X8X2: Type = Type::U(8, 8, 2); static U8X8X3: Type = Type::U(8, 8, 3); static U8X8X4: Type = Type::U(8, 8, 4); @@ -371,6 +372,90 @@ fn verify_all_signatures() { "vsriq_n_p8", "vsri_n_p16", "vsriq_n_p16", + "__smulbb", + "__smultb", + "__smulbt", + "__smultt", + "__smulwb", + "__smulwt", + "__qadd", + "__qsub", + "__qdbl", + "__smlabb", + "__smlabt", + "__smlatb", + "__smlatt", + "__smlawb", + "__smlawt", + "__qadd8", + "__qsub8", + "__qsub16", + "__qadd16", + "__qasx", + "__qsax", + "__sadd16", + "__sadd8", + "__smlad", + "__smlsd", + "__sasx", + "__sel", + "__shadd8", + "__shadd16", + "__shsub8", + "__usub8", + "__ssub8", + "__shsub16", + "__smuad", + "__smuadx", + "__smusd", + "__smusdx", + "__usad8", + "__usada8", + "vld1_s8", + "vld1q_s8", + "vld1q_s8", + "vld1_s16", + "vld1q_s16", + "vld1_s32", + "vld1q_s32", + "vld1_s64", + "vld1q_s64", + "vld1_u8", + "vld1q_u8", + "vld1_u16", + "vld1q_u16", + "vld1_u32", + "vld1q_u32", + "vld1_u64", + "vld1q_u64", + "vld1_p8", + "vld1q_p8", + "vld1_p16", + "vld1q_p16", + "vld1_f32", + "vld1q_f32", + "vld1_f64", + "vld1q_f64", + "vpadal_s8", + "vpadal_s16", + "vpadal_s32", + "vpadalq_s8", + "vpadalq_s16", + "vpadalq_s32", + "vpadal_u8", + "vpadal_u16", + "vpadal_u32", + "vpadalq_u8", + "vpadalq_u16", + "vpadalq_u32", + "__ldrex", + "__strex", + "__ldrexb", + "__strexb", + "__ldrexh", + "__strexh", + "__clrex", + "__dbg", ]; if !skip.contains(&rust.name) { println!( @@ -402,6 +487,7 @@ fn verify_all_signatures() { "vreinterpret_p64_s64", "vreinterpret_f32_p64", "vreinterpretq_f32_p64", + "__dbg", ]; let arm = match map.get(rust.name) { Some(i) => i, @@ -412,11 +498,13 @@ fn verify_all_signatures() { // TODO: we still need to verify these intrinsics or find a // reference for them, need to figure out where though! if !rust.file.ends_with("dsp.rs\"") + && !rust.file.ends_with("simd32.rs\"") && !rust.file.ends_with("cmsis.rs\"") && !rust.file.ends_with("v6.rs\"") && !rust.file.ends_with("v7.rs\"") && !rust.file.ends_with("v8.rs\"") && !rust.file.ends_with("tme.rs\"") + && !rust.file.ends_with("ex.rs\"") && !skip_intrinsic_verify.contains(&rust.name) { println!(