From bb460fab314dd62ce25e4f25bacf3cc0e0890424 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Thu, 1 May 2025 18:41:05 +0000
Subject: [PATCH 1/3] update-api-list: Match subdirectories within arch

---
 etc/update-api-list.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/etc/update-api-list.py b/etc/update-api-list.py
index 0770a8b20..28ff22f4c 100755
--- a/etc/update-api-list.py
+++ b/etc/update-api-list.py
@@ -123,7 +123,9 @@ def _init_defs(self, index: IndexTy) -> None:
 
         # A lot of the `arch` module is often configured out so doesn't show up in docs. Use
         # string matching as a fallback.
-        for fname in glob("libm/src/math/arch/**.rs", root_dir=ROOT_DIR):
+        for fname in glob(
+            "libm/src/math/arch/**/*.rs", root_dir=ROOT_DIR, recursive=True
+        ):
             contents = (ROOT_DIR.joinpath(fname)).read_text()
 
             for name in self.public_functions:

From a549da43a1614f11cca8bd4aa7976f2fc2e844f4 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Thu, 1 May 2025 17:49:56 +0000
Subject: [PATCH 2/3] Rename the i686 module to x86

This module is used for both i686 and x86-64.
---
 etc/function-definitions.json          | 4 ++--
 libm/src/math/arch/mod.rs              | 4 ++--
 libm/src/math/arch/{i686.rs => x86.rs} | 0
 3 files changed, 4 insertions(+), 4 deletions(-)
 rename libm/src/math/arch/{i686.rs => x86.rs} (100%)

diff --git a/etc/function-definitions.json b/etc/function-definitions.json
index 9e5774eaf..691205ddf 100644
--- a/etc/function-definitions.json
+++ b/etc/function-definitions.json
@@ -932,8 +932,8 @@
     "sqrt": {
         "sources": [
             "libm/src/math/arch/aarch64.rs",
-            "libm/src/math/arch/i686.rs",
             "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/arch/x86.rs",
             "libm/src/math/generic/sqrt.rs",
             "libm/src/math/sqrt.rs"
         ],
@@ -942,8 +942,8 @@
     "sqrtf": {
         "sources": [
             "libm/src/math/arch/aarch64.rs",
-            "libm/src/math/arch/i686.rs",
             "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/arch/x86.rs",
             "libm/src/math/generic/sqrt.rs",
             "libm/src/math/sqrt.rs"
         ],
diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs
index d9f2aad66..67352f90c 100644
--- a/libm/src/math/arch/mod.rs
+++ b/libm/src/math/arch/mod.rs
@@ -15,8 +15,8 @@ cfg_if! {
             ceil, ceilf, fabs, fabsf, floor, floorf, rint, rintf, sqrt, sqrtf, trunc, truncf,
         };
     } else if #[cfg(target_feature = "sse2")] {
-        mod i686;
-        pub use i686::{sqrt, sqrtf};
+        mod x86;
+        pub use x86::{sqrt, sqrtf};
     } else if #[cfg(all(
         any(target_arch = "aarch64", target_arch = "arm64ec"),
         target_feature = "neon"
diff --git a/libm/src/math/arch/i686.rs b/libm/src/math/arch/x86.rs
similarity index 100%
rename from libm/src/math/arch/i686.rs
rename to libm/src/math/arch/x86.rs

From e1d996a44432a67cac619495ba5fdfa4df56f811 Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Tue, 29 Apr 2025 22:16:41 +0000
Subject: [PATCH 3/3] Use runtime feature detection for fma routines on x86

Get performance closer to the glibc implementations by adding assembly
fma routines, with runtime feature detection so they are used even if
the crate was not compiled with `+fma` (as the distributed standard
library often is not). Glibc uses ifuncs; this implementation instead
stores a function pointer in an atomic.

The detected CPU flags are also cached so that the detection cost is
not paid again on the first call to each function. The feature detection
code is a slightly simplified version of `std-detect`.

Musl sources were used as a reference [1].

Fixes: https://github.com/rust-lang/rust/issues/140452 once synced

[1]: https://github.com/bminor/musl/blob/c47ad25ea3b484e10326f933e927c0bc8cded3da/src/math/x32/fma.c
---
 etc/function-definitions.json           |   2 +
 libm/src/math/arch/mod.rs               |   2 +-
 libm/src/math/arch/x86.rs               |   5 +
 libm/src/math/arch/x86/detect.rs        | 229 ++++++++++++++++++++++++
 libm/src/math/arch/x86/fma.rs           | 134 ++++++++++++++
 libm/src/math/fma.rs                    |  10 +-
 libm/src/math/support/feature_detect.rs | 206 +++++++++++++++++++++
 libm/src/math/support/mod.rs            |   3 +
 8 files changed, 588 insertions(+), 3 deletions(-)
 create mode 100644 libm/src/math/arch/x86/detect.rs
 create mode 100644 libm/src/math/arch/x86/fma.rs
 create mode 100644 libm/src/math/support/feature_detect.rs

diff --git a/etc/function-definitions.json b/etc/function-definitions.json
index 691205ddf..4f796905b 100644
--- a/etc/function-definitions.json
+++ b/etc/function-definitions.json
@@ -343,6 +343,7 @@
     "fma": {
         "sources": [
             "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/x86/fma.rs",
             "libm/src/math/fma.rs"
         ],
         "type": "f64"
@@ -350,6 +351,7 @@
     "fmaf": {
         "sources": [
             "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/x86/fma.rs",
             "libm/src/math/fma.rs"
         ],
         "type": "f32"
diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs
index 67352f90c..984ae7f31 100644
--- a/libm/src/math/arch/mod.rs
+++ b/libm/src/math/arch/mod.rs
@@ -16,7 +16,7 @@ cfg_if! {
         };
     } else if #[cfg(target_feature = "sse2")] {
         mod x86;
-        pub use x86::{sqrt, sqrtf};
+        pub use x86::{sqrt, sqrtf, fma, fmaf};
     } else if #[cfg(all(
         any(target_arch = "aarch64", target_arch = "arm64ec"),
         target_feature = "neon"
diff --git a/libm/src/math/arch/x86.rs b/libm/src/math/arch/x86.rs
index 3e1d19bfa..454aa2850 100644
--- a/libm/src/math/arch/x86.rs
+++ b/libm/src/math/arch/x86.rs
@@ -1,5 +1,10 @@
 //! Architecture-specific support for x86-32 and x86-64 with SSE2
 
+mod detect;
+mod fma;
+
+pub use fma::{fma, fmaf};
+
 pub fn sqrtf(mut x: f32) -> f32 {
     // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory
     // access or side effects.
diff --git a/libm/src/math/arch/x86/detect.rs b/libm/src/math/arch/x86/detect.rs
new file mode 100644
index 000000000..71c3281dc
--- /dev/null
+++ b/libm/src/math/arch/x86/detect.rs
@@ -0,0 +1,229 @@
+#[cfg(target_arch = "x86")]
+use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
+
+use crate::support::{Flags, get_or_init_flags_cache};
+
+/// Masks for cached CPU features; bit positions are arbitrary (not tied to any `cpuid` register).
+pub mod cpu_flags {
+    use crate::support::unique_masks;
+
+    unique_masks! {
+        u32, // mask type; the macro rejects any mask that would use the top bit
+        SSE3,
+        F16C,
+        SSE,
+        SSE2,
+        ERMSB,
+        MOVRS,
+        FMA,
+        FMA4,
+        AVX512FP16,
+        AVX512BF16,
+    }
+}
+
+/// Return the detected CPU features, running detection only if no cached value exists yet.
+pub fn get_cpu_features() -> Flags {
+    // Process-wide cache; zero means that detection has not stored a result yet.
+    static DETECTED: core::sync::atomic::AtomicU32 = core::sync::atomic::AtomicU32::new(0);
+    get_or_init_flags_cache(&DETECTED, load_x86_features)
+}
+
+/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`.
+///
+/// Implementation is adapted from [std-detect][std-detect].
+///
+/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142
+fn load_x86_features() -> Flags {
+    let mut value = Flags::empty();
+
+    if cfg!(target_env = "sgx") {
+        // `cpuid` is not usable under SGX because its output is untrusted data.
+        return Flags::empty();
+    }
+
+    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
+    // has `cpuid` support.
+
+    // 0. EAX = 0: Basic Information:
+    // - EAX returns the "Highest Function Parameter", that is, the maximum leaf
+    //   value for subsequent calls of `cpuid` in range [0, 0x8000_0000].
+    // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX
+    //   (in that order)
+    let mut vendor_id = [0u8; 12];
+    let max_basic_leaf;
+    unsafe {
+        let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0);
+        max_basic_leaf = eax;
+        vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes());
+        vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes());
+        vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes());
+    }
+
+    if max_basic_leaf < 1 {
+        // Earlier Intel 486, CPUID not implemented
+        return value;
+    }
+
+    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
+    // Contains information about most x86 features.
+    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) };
+    let proc_info_ecx = Flags::from_bits(ecx);
+    let proc_info_edx = Flags::from_bits(edx);
+
+    // EAX = 7: Queries "Extended Features";
+    // Contains the ERMSB, AVX512FP16, MOVRS and AVX512BF16 bits that are read below.
+    let mut extended_features_ebx = Flags::empty();
+    let mut extended_features_edx = Flags::empty();
+    let mut extended_features_eax_leaf_1 = Flags::empty();
+    if max_basic_leaf >= 7 {
+        let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
+        extended_features_ebx = Flags::from_bits(ebx);
+        extended_features_edx = Flags::from_bits(edx);
+
+        let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
+        extended_features_eax_leaf_1 = Flags::from_bits(eax);
+    }
+
+    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
+    // - EAX returns the max leaf value for extended information, that is,
+    //   `cpuid` calls in range [0x8000_0000; u32::MAX]:
+    let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax;
+
+    // EAX = 0x8000_0001, ECX = 0: "Extended Processor Info and Feature Bits", if that leaf exists
+    let mut extended_proc_info_ecx = Flags::empty();
+    if extended_max_basic_leaf >= 0x8000_0001_u32 {
+        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
+        extended_proc_info_ecx = Flags::from_bits(ecx);
+    }
+
+    let mut enable = |regflags: Flags, regbit, flag| {
+        if regflags.test_nth(regbit) {
+            value.insert(flag);
+        }
+    };
+
+    enable(proc_info_ecx, 0, cpu_flags::SSE3);
+    enable(proc_info_ecx, 29, cpu_flags::F16C);
+    enable(proc_info_edx, 25, cpu_flags::SSE);
+    enable(proc_info_edx, 26, cpu_flags::SSE2);
+    enable(extended_features_ebx, 9, cpu_flags::ERMSB);
+    enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS);
+
+    // `XSAVE` and `AVX` support:
+    let cpu_xsave = proc_info_ecx.test_nth(26);
+    if cpu_xsave {
+        // 0. Here the CPU supports `XSAVE`.
+
+        // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
+        //    supports saving the state of the AVX/AVX2 vector registers on
+        //    context-switches, see:
+        //
+        // - [intel: is avx enabled?][is_avx_enabled],
+        // - [mozilla: sse.cpp][mozilla_sse_cpp].
+        //
+        // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
+        // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
+        let cpu_osxsave = proc_info_ecx.test_nth(27);
+
+        if cpu_osxsave {
+            // 2. The OS must have signaled the CPU that it supports saving and
+            // restoring the:
+            //
+            // * SSE -> `XCR0.SSE[1]`
+            // * AVX -> `XCR0.AVX[2]`
+            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
+            // * AMX -> `XCR0.AMX[18:17]`
+            //
+            // by setting the corresponding bits of `XCR0` to `1`.
+            //
+            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
+            let xcr0 = unsafe { _xgetbv(0) };
+            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
+            let os_avx_support = xcr0 & 6 == 6;
+            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
+            let os_avx512_support = xcr0 & 0xe0 == 0xe0;
+
+            // Only if the OS and the CPU support saving/restoring the AVX
+            // registers we enable `xsave` support:
+            if os_avx_support {
+                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
+                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
+                // Developer’s Manual, Volume 1: Basic Architecture":
+                //
+                // "Software enables the XSAVE feature set by setting
+                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
+                // instruction). If this bit is 0, execution of any of XGETBV,
+                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
+                // causes an invalid-opcode exception (#UD)"
+
+                // FMA (uses 256-bit wide registers):
+                enable(proc_info_ecx, 12, cpu_flags::FMA);
+
+                // For AVX-512 the OS also needs to support saving/restoring
+                // the extended state, only then we enable AVX-512 support:
+                if os_avx512_support {
+                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
+                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
+                }
+            }
+        }
+    }
+
+    // Hygon Dhyana originates from AMD technology and shares most of its architecture with
+    // AMD's family 17h, but has a different CPU vendor ID ("HygonGenuine") and family series
+    // number (Family 18h).
+    //
+    // For CPUID feature bits, Hygon Dhyana (family 18h) shares the same definitions as AMD
+    // family 17h.
+    //
+    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
+    // (AMD64 Architecture Programmer's Manual, Appendix E).
+    // Related Hygon kernel patch can be found on
+    // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
+    if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
+        // These features are available on AMD arch CPUs:
+        enable(extended_proc_info_ecx, 16, cpu_flags::FMA4);
+    }
+
+    value
+}
+
+#[cfg(test)]
+mod tests {
+    extern crate std;
+    use std::is_x86_feature_detected;
+
+    use super::*;
+
+    #[test]
+    fn check_matches_std() {
+        let features = get_cpu_features();
+        let named_flags = cpu_flags::ALL.iter().zip(cpu_flags::NAMES);
+
+        for (&flag, &name) in named_flags {
+            // What the standard library reports for the same feature.
+            let std_detected = match flag {
+                cpu_flags::SSE3 => is_x86_feature_detected!("sse3"),
+                cpu_flags::F16C => is_x86_feature_detected!("f16c"),
+                cpu_flags::SSE => is_x86_feature_detected!("sse"),
+                cpu_flags::SSE2 => is_x86_feature_detected!("sse2"),
+                cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"),
+                cpu_flags::MOVRS => continue, // only very recent support in std
+                cpu_flags::FMA => is_x86_feature_detected!("fma"),
+                cpu_flags::FMA4 => continue, // not yet supported in std
+                cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"),
+                cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"),
+                _ => panic!("untested CPU flag {name}"),
+            };
+
+            assert_eq!(
+                std_detected,
+                features.contains(flag),
+                "different flag {name}. flags: {features:?}"
+            );
+        }
+    }
+}
diff --git a/libm/src/math/arch/x86/fma.rs b/libm/src/math/arch/x86/fma.rs
new file mode 100644
index 000000000..eb43f4696
--- /dev/null
+++ b/libm/src/math/arch/x86/fma.rs
@@ -0,0 +1,134 @@
+//! Use assembly fma if the `fma` or `fma4` feature is detected at runtime.
+
+use core::arch::asm;
+
+use super::super::super::generic;
+use super::detect::{cpu_flags, get_cpu_features};
+use crate::support::{Round, select_once};
+
+pub fn fma(x: f64, y: f64, z: f64) -> f64 {
+    select_once! {
+        sig: fn(x: f64, y: f64, z: f64) -> f64,
+        init: || {
+            let detected = get_cpu_features();
+            if detected.contains(cpu_flags::FMA) {
+                fma_with_fma as Func
+            } else if detected.contains(cpu_flags::FMA4) {
+                fma_with_fma4 as Func
+            } else {
+                fma_fallback as Func
+            }
+        },
+        // SAFETY: `selected` comes from `init`, which checked the CPU feature it requires.
+        call: |selected: Func| unsafe { selected(x, y, z) },
+    }
+}
+
+pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
+    select_once! {
+        sig: fn(x: f32, y: f32, z: f32) -> f32,
+        init: || {
+            let detected = get_cpu_features();
+            if detected.contains(cpu_flags::FMA) {
+                fmaf_with_fma as Func
+            } else if detected.contains(cpu_flags::FMA4) {
+                fmaf_with_fma4 as Func
+            } else {
+                fmaf_fallback as Func
+            }
+        },
+        // SAFETY: `selected` comes from `init`, which checked the CPU feature it requires.
+        call: |selected: Func| unsafe { selected(x, y, z) },
+    }
+}
+
+/// # Safety
+///
+/// The running CPU must support the `fma` target feature.
+unsafe fn fma_with_fma(mut x: f64, y: f64, z: f64) -> f64 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA));
+
+    // SAFETY: the caller guarantees `fma`, which provides `vfmadd213sd`. The instruction works
+    // on registers only: no memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmadd213sd {x}, {y}, {z}", // x = (y * x) + z
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+/// # Safety
+///
+/// The running CPU must support the `fma` target feature.
+unsafe fn fmaf_with_fma(mut x: f32, y: f32, z: f32) -> f32 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA));
+
+    // SAFETY: the caller guarantees `fma`, which provides `vfmadd213ss`. The instruction works
+    // on registers only: no memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmadd213ss {x}, {y}, {z}", // x = (y * x) + z
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+/// # Safety
+///
+/// The running CPU must support the `fma4` target feature.
+unsafe fn fma_with_fma4(mut x: f64, y: f64, z: f64) -> f64 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA4));
+
+    // SAFETY: the caller guarantees `fma4`, which provides `vfmaddsd`. The instruction works
+    // on registers only: no memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmaddsd {x}, {x}, {y}, {z}", // x = (x * y) + z
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+/// # Safety
+///
+/// The running CPU must support the `fma4` target feature.
+unsafe fn fmaf_with_fma4(mut x: f32, y: f32, z: f32) -> f32 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA4));
+
+    // SAFETY: the caller guarantees `fma4`, which provides `vfmaddss`. The instruction works
+    // on registers only: no memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmaddss {x}, {x}, {y}, {z}", // x = (x * y) + z
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+// FIXME: `select_implementation` should let arch implementations defer to the generic fallback,
+// so that the fallback bodies do not need to be duplicated here.
+
+fn fma_fallback(x: f64, y: f64, z: f64) -> f64 {
+    generic::fma_round(x, y, z, Round::Nearest).val
+}
+
+fn fmaf_fallback(x: f32, y: f32, z: f32) -> f32 {
+    generic::fma_wide_round(x, y, z, Round::Nearest).val
+}
diff --git a/libm/src/math/fma.rs b/libm/src/math/fma.rs
index 78f0f8992..5bf473cfe 100644
--- a/libm/src/math/fma.rs
+++ b/libm/src/math/fma.rs
@@ -19,7 +19,10 @@ pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 {
 pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
     select_implementation! {
         name: fmaf,
-        use_arch: all(target_arch = "aarch64", target_feature = "neon"),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            target_feature = "sse2",
+        ),
         args: x, y, z,
     }
 
@@ -33,7 +36,10 @@ pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
 pub fn fma(x: f64, y: f64, z: f64) -> f64 {
     select_implementation! {
         name: fma,
-        use_arch: all(target_arch = "aarch64", target_feature = "neon"),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            target_feature = "sse2",
+        ),
         args: x, y, z,
     }
 
diff --git a/libm/src/math/support/feature_detect.rs b/libm/src/math/support/feature_detect.rs
new file mode 100644
index 000000000..cb669b073
--- /dev/null
+++ b/libm/src/math/support/feature_detect.rs
@@ -0,0 +1,206 @@
+//! Helpers for runtime target feature detection that are shared across architectures.
+
+use core::sync::atomic::{AtomicU32, Ordering};
+
+/// Define one `pub const` single-bit mask of type `$ty` per identifier, using bits 0, 1, 2, ...
+#[allow(unused_macros)] // only used on some architectures
+macro_rules! unique_masks {
+    ($ty:ty, $($name:ident,)+) => {
+        #[cfg(test)] // list of every mask, in declaration order
+        pub const ALL: &[$ty] = &[$($name),+];
+        #[cfg(test)] // identifier of every mask as a string, in the same order as `ALL`
+        pub const NAMES: &[&str] = &[$(stringify!($name)),+];
+
+        unique_masks!(@one; $ty; 0; $($name,)+);
+    };
+    // Internal `@one` rules: an empty list ends the recursion, otherwise define one constant.
+    (@one; $_ty:ty; $_idx:expr;) => {};
+    (@one; $ty:ty; $shift:expr; $name:ident, $($tail:tt)*) => {
+        pub const $name: $ty = 1 << $shift;
+        // Ensure the top bit is not used since it stores initialized state.
+        const _: () = assert!($name != (1 << (<$ty>::BITS - 1)));
+        // Recurse on the remaining names with the shift incremented by one.
+        unique_masks!(@one; $ty; $shift + 1; $($tail)*);
+    };
+}
+
+/// Call `init` on first use to choose an implementation, then reuse it for later calls.
+///
+/// - `sig` is the function signature, written as `fn(arg: Ty, ...) -> Ret`.
+/// - `init` is a callable expression, run on the first call rather than at program startup,
+///   that chooses an implementation and returns a function pointer.
+/// - `call` is a callable expression that invokes the function returned by `init`,
+///   encapsulating any safety preconditions.
+///
+/// The type `Func` (an `unsafe fn` pointer with the signature from `sig`) is available in both.
+///
+/// This is effectively our version of an ifunc without linker support. Note that `init` may be
+/// called more than once until one completes.
+#[allow(unused_macros)] // only used on some architectures
+macro_rules! select_once {
+    (
+        sig: fn($($arg:ident: $ArgTy:ty),*) -> $RetTy:ty,
+        init: $init:expr,
+        call: $call:expr,
+    ) => {{
+        use core::mem;
+        use core::sync::atomic::{AtomicPtr, Ordering};
+
+        type Func = unsafe fn($($arg: $ArgTy),*) -> $RetTy;
+
+        /// Pointer to the function that gets called. It starts out as `initializer`, which
+        /// replaces it with the implementation chosen by `init`.
+        static FUNC: AtomicPtr<()> = AtomicPtr::new((initializer as Func) as *mut ());
+
+        /// Runs while `FUNC` still points here: selects, stores, and calls the implementation.
+        fn initializer($($arg: $ArgTy),*) -> $RetTy {
+            // Select an implementation, ensuring a 'static lifetime.
+            let fn_ptr: Func = $init();
+            FUNC.store(fn_ptr as *mut (), Ordering::Relaxed);
+
+            // Forward the call that triggered initialization to the selected function.
+            $call(fn_ptr)
+        }
+
+        let raw: *mut () = FUNC.load(Ordering::Relaxed);
+
+        // SAFETY: will only ever be `initializer` or another function pointer that has the
+        // 'static lifetime.
+        let fn_ptr: Func = unsafe { mem::transmute::<*mut (), Func>(raw) };
+
+        $call(fn_ptr)
+    }}
+}
+
+pub(crate) use {select_once, unique_masks};
+
+use crate::support::cold_path;
+
+/// Minimal `u32` bit-flag set, modeled after the `bitflags` crate.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct Flags(u32);
+
+#[allow(dead_code)] // only used on some architectures
+impl Flags {
+    /// A set with every bit cleared.
+    pub const fn empty() -> Self {
+        Self::from_bits(0)
+    }
+
+    /// Wrap a raw integer; its set bits become the flags.
+    pub const fn from_bits(val: u32) -> Self {
+        Flags(val)
+    }
+
+    /// The raw integer backing this set.
+    pub fn bits(&self) -> u32 {
+        self.0
+    }
+
+    /// Turn on every bit that is set in `mask`.
+    pub fn insert(&mut self, mask: u32) {
+        *self = Self(self.0 | mask);
+    }
+
+    /// True when all bits of `mask` are set.
+    pub fn contains(&self, mask: u32) -> bool {
+        mask & !self.0 == 0
+    }
+
+    /// True when bit number `bit`, counted from the least significant bit, is set.
+    pub fn test_nth(&self, bit: u32) -> bool {
+        debug_assert!(bit < u32::BITS, "bit index out-of-bounds");
+        (self.0 >> bit) & 1 == 1
+    }
+}
+
+/// Read feature flags from `cache`, running `init` to populate it when the cache has not been
+/// initialized yet.
+///
+/// `init` may run more than once, since nothing prevents two callers from both seeing it unset.
+#[allow(dead_code)] // only used on some architectures
+pub fn get_or_init_flags_cache(cache: &AtomicU32, init: impl FnOnce() -> Flags) -> Flags {
+    // Bit 31 is reserved as the marker that the cache already holds a computed value.
+    const INITIALIZED: u32 = 1 << 31;
+
+    // Relaxed ops are sufficient since the result should always be the same.
+    let mut cached = Flags::from_bits(cache.load(Ordering::Relaxed));
+
+    if cached.bits() & INITIALIZED == 0 {
+        // Without this, `init` is inlined and the bit check gets wrapped in `init`'s lengthy
+        // prologue/epilogue. Cold pathing gives a preferable load->test->?jmp->ret.
+        cold_path();
+
+        cached = init();
+        debug_assert!(
+            cached.bits() & INITIALIZED == 0,
+            "initialized bit shouldn't be set"
+        );
+        cached.insert(INITIALIZED);
+        cache.store(cached.bits(), Ordering::Relaxed);
+    }
+
+    cached
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn unique_masks() {
+        unique_masks! {
+            u32,
+            V0,
+            V1,
+            V2,
+        }
+        assert_eq!(V0, 0b001_u32);
+        assert_eq!(V1, 0b010_u32);
+        assert_eq!(V2, 0b100_u32);
+        assert_eq!(ALL, [V0, V1, V2]);
+        assert_eq!(NAMES, ["V0", "V1", "V2"]);
+    }
+
+    #[test]
+    fn flag_cache_is_used() {
+        // The second call must return what the first call cached, ignoring its own `init`.
+        static CACHE: AtomicU32 = AtomicU32::new(0);
+
+        let first = Flags::from_bits(0x1);
+        let second = Flags::from_bits(0x2);
+
+        let from_init = get_or_init_flags_cache(&CACHE, || first);
+        let from_cache = get_or_init_flags_cache(&CACHE, || second);
+
+        let expected = Flags::from_bits(0x1 | (1 << 31)); // `first` plus the init bit
+
+        assert_eq!(from_init, expected);
+        assert_eq!(from_cache, expected);
+    }
+
+    #[test]
+    fn select_cache_is_used() {
+        // `init` bumps this counter, so it records how many times selection ran.
+        static CALLED: AtomicU32 = AtomicU32::new(0);
+
+        fn inner() {
+            fn nop() {}
+
+            select_once! {
+                sig: fn() -> (),
+                init: || {
+                    CALLED.fetch_add(1, Ordering::Relaxed);
+                    nop
+                },
+                call: |fn_ptr: Func| unsafe { fn_ptr() },
+            }
+        }
+
+        // However often `inner` runs, `init` should only have been called once.
+        for _ in 0..2 {
+            inner();
+            assert_eq!(CALLED.load(Ordering::Relaxed), 1);
+        }
+    }
+}
diff --git a/libm/src/math/support/mod.rs b/libm/src/math/support/mod.rs
index ee3f2bbdf..727b9a360 100644
--- a/libm/src/math/support/mod.rs
+++ b/libm/src/math/support/mod.rs
@@ -2,6 +2,7 @@
 pub mod macros;
 mod big;
 mod env;
+mod feature_detect;
 mod float_traits;
 pub mod hex_float;
 mod int_traits;
@@ -10,6 +11,8 @@ mod int_traits;
 pub use big::{i256, u256};
 pub use env::{FpResult, Round, Status};
 #[allow(unused_imports)]
+pub(crate) use feature_detect::{Flags, get_or_init_flags_cache, select_once, unique_masks};
+#[allow(unused_imports)]
 pub use float_traits::{DFloat, Float, HFloat, IntTy};
 pub(crate) use float_traits::{f32_from_bits, f64_from_bits};
 #[cfg(f16_enabled)]