From e880ee2eb2fc483085180c9a879518ff1ed52327 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Thu, 6 Jun 2024 17:07:37 +0200
Subject: [PATCH 1/7] [libc][math][c23] Add fmodf16 C23 math function

---
 libc/config/linux/aarch64/entrypoints.txt |  1 +
 libc/config/linux/x86_64/entrypoints.txt  |  1 +
 libc/docs/math/index.rst                  |  2 +-
 libc/spec/stdc.td                         |  1 +
 libc/src/__support/FPUtil/FPBits.h        |  2 +-
 libc/src/__support/FPUtil/generic/FMod.h  |  4 ++-
 libc/src/math/CMakeLists.txt              |  1 +
 libc/src/math/fmodf16.h                   | 20 ++++++++++++
 libc/src/math/generic/CMakeLists.txt      | 13 ++++++++
 libc/src/math/generic/fmodf16.cpp         | 19 ++++++++++++
 libc/test/src/math/smoke/CMakeLists.txt   | 33 +++++++++++++++-----
 libc/test/src/math/smoke/FModTest.h       | 37 ++++++++++++-----------
 libc/test/src/math/smoke/fmodf16_test.cpp | 13 ++++++++
 13 files changed, 118 insertions(+), 29 deletions(-)
 create mode 100644 libc/src/math/fmodf16.h
 create mode 100644 libc/src/math/generic/fmodf16.cpp
 create mode 100644 libc/test/src/math/smoke/fmodf16_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 33ecff813a1fb..193183ddeb78a 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -515,6 +515,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.fminimum_magf16
     libc.src.math.fminimum_mag_numf16
     libc.src.math.fminimum_numf16
+    libc.src.math.fmodf16
     libc.src.math.fromfpf16
     libc.src.math.fromfpxf16
     libc.src.math.llrintf16
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index ebacb1c59ceec..ee748b5cd8e1a 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -548,6 +548,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.fminimum_magf16
     libc.src.math.fminimum_mag_numf16
     libc.src.math.fminimum_numf16
+    libc.src.math.fmodf16
     libc.src.math.fromfpf16
     libc.src.math.fromfpxf16
     libc.src.math.llrintf16
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index b9507f0887cd7..24b88a52f049d 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -156,7 +156,7 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fminimum_num     | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.12.9              | F.10.9.5                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| fmod             | |check|          | |check|         | |check|                |                      | |check|                | 7.12.10.1              | F.10.7.1                   |
+| fmod             | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.10.1              | F.10.7.1                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fmul             | N/A              |                 |                        | N/A                  |                        | 7.12.14.3              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 210f2a1325169..90e5246a63505 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -478,6 +478,7 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"fmod", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fmodf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"fmodl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"fmodf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
           GuardedFunctionSpec<"fmodf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
 
           FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h
index d3c96d2d613d6..559ecde767c30 100644
--- a/libc/src/__support/FPUtil/FPBits.h
+++ b/libc/src/__support/FPUtil/FPBits.h
@@ -744,7 +744,7 @@ struct FPRepImpl : public FPRepSem<fp_type, RetT> {
     if (LIBC_LIKELY(ep >= 0)) {
       // Implicit number bit will be removed by mask
       result.set_significand(number);
-      result.set_biased_exponent(ep + 1);
+      result.set_biased_exponent(static_cast<StorageType>(ep + 1));
     } else {
       result.set_significand(number >> -ep);
     }
diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h
index 211ab926d28b0..f840a92b1a5a2 100644
--- a/libc/src/__support/FPUtil/generic/FMod.h
+++ b/libc/src/__support/FPUtil/generic/FMod.h
@@ -210,7 +210,9 @@ class FMod {
                     e_x - e_y <= int(FPB::EXP_LEN))) {
       StorageType m_x = sx.get_explicit_mantissa();
       StorageType m_y = sy.get_explicit_mantissa();
-      StorageType d = (e_x == e_y) ? (m_x - m_y) : (m_x << (e_x - e_y)) % m_y;
+      StorageType d = (e_x == e_y)
+                          ? (m_x - m_y)
+                          : static_cast<StorageType>(m_x << (e_x - e_y)) % m_y;
       if (d == 0)
         return FPB::zero();
       // iy - 1 because of "zero power" for number with power 1
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 7a349ddc53724..141f66817e53e 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -183,6 +183,7 @@ add_math_entrypoint_object(fminimum_mag_numf128)
 add_math_entrypoint_object(fmod)
 add_math_entrypoint_object(fmodf)
 add_math_entrypoint_object(fmodl)
+add_math_entrypoint_object(fmodf16)
 add_math_entrypoint_object(fmodf128)
 
 add_math_entrypoint_object(frexp)
diff --git a/libc/src/math/fmodf16.h b/libc/src/math/fmodf16.h
new file mode 100644
index 0000000000000..ab658430275d8
--- /dev/null
+++ b/libc/src/math/fmodf16.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fmodf16 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMODF16_H
+#define LLVM_LIBC_SRC_MATH_FMODF16_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 fmodf16(float16 x, float16 y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FMODF16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b1d786fc6b29f..9c9073c0ea7bf 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -2887,6 +2887,19 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  fmodf16
+  SRCS
+    fmodf16.cpp
+  HDRS
+    ../fmodf16.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   fmodf128
   SRCS
diff --git a/libc/src/math/generic/fmodf16.cpp b/libc/src/math/generic/fmodf16.cpp
new file mode 100644
index 0000000000000..0a54a65806de9
--- /dev/null
+++ b/libc/src/math/generic/fmodf16.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of fmodf16 function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmodf16.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, fmodf16, (float16 x, float16 y)) {
+  return fputil::generic::FMod<float16>::eval(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 110fa1de97d6d..07e8b5dddfa6c 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3111,10 +3111,10 @@ add_fp_unittest(
   HDRS
     FModTest.h
   DEPENDS
+    libc.hdr.fenv_macros
     libc.src.errno.errno
     libc.src.math.fmodf
-    libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.FPUtil.fenv_impl
   # FIXME: Currently fails on the GPU build.
   UNIT_TEST_ONLY
 )
@@ -3128,10 +3128,10 @@ add_fp_unittest(
   HDRS
     FModTest.h
   DEPENDS
+    libc.hdr.fenv_macros
     libc.src.errno.errno
     libc.src.math.fmod
-    libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.FPUtil.fenv_impl
   # FIXME: Currently fails on the GPU build.
   UNIT_TEST_ONLY
 )
@@ -3145,10 +3145,27 @@ add_fp_unittest(
   HDRS
     FModTest.h
   DEPENDS
+    libc.hdr.fenv_macros
     libc.src.errno.errno
     libc.src.math.fmodl
-    libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.FPUtil.fenv_impl
+  # FIXME: Currently fails on the GPU build.
+  UNIT_TEST_ONLY
+)
+
+add_fp_unittest(
+  fmodf16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fmodf16_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.errno.errno
+    libc.src.math.fmodf16
+    libc.src.__support.FPUtil.fenv_impl
   # FIXME: Currently fails on the GPU build.
   UNIT_TEST_ONLY
 )
@@ -3162,10 +3179,10 @@ add_fp_unittest(
   HDRS
     FModTest.h
   DEPENDS
+    libc.hdr.fenv_macros
     libc.src.errno.errno
     libc.src.math.fmodf128
-    libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.FPUtil.fenv_impl
   # FIXME: Currently fails on the GPU build.
   UNIT_TEST_ONLY
 )
diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h
index f1015d6497fcd..405e3107438d4 100644
--- a/libc/test/src/math/smoke/FModTest.h
+++ b/libc/test/src/math/smoke/FModTest.h
@@ -9,13 +9,13 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
 
-#include "src/__support/FPUtil/BasicOperations.h"
-#include "src/__support/FPUtil/NearestIntegerOperations.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/errno/libc_errno.h"
 #include "test/UnitTest/FEnvSafeTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "hdr/math_macros.h"
+#include "hdr/fenv_macros.h"
 
 #define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
   EXPECT_FP_EQ(expected, f(x, y));                                             \
@@ -210,7 +210,8 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
   }
 
   void testRegularExtreme(FModFunc f) {
-
+    if constexpr (sizeof(T) < sizeof(float))
+      return; // Cases below use float's extreme exponents.
     TEST_REGULAR(0x1p127L, 0x3p-149L, 0x1p-149L);
     TEST_REGULAR(0x1p127L, -0x3p-149L, 0x1p-149L);
     TEST_REGULAR(0x1p127L, 0x3p-148L, 0x1p-147L);
@@ -224,20 +225,20 @@ class FmodTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
     TEST_REGULAR(-0x1p127L, 0x3p-126L, -0x1p-125L);
     TEST_REGULAR(-0x1p127L, -0x3p-126L, -0x1p-125L);
 
-    if constexpr (sizeof(T) >= sizeof(double)) {
-      TEST_REGULAR(0x1p1023L, 0x3p-1074L, 0x1p-1073L);
-      TEST_REGULAR(0x1p1023L, -0x3p-1074L, 0x1p-1073L);
-      TEST_REGULAR(0x1p1023L, 0x3p-1073L, 0x1p-1073L);
-      TEST_REGULAR(0x1p1023L, -0x3p-1073L, 0x1p-1073L);
-      TEST_REGULAR(0x1p1023L, 0x3p-1022L, 0x1p-1021L);
-      TEST_REGULAR(0x1p1023L, -0x3p-1022L, 0x1p-1021L);
-      TEST_REGULAR(-0x1p1023L, 0x3p-1074L, -0x1p-1073L);
-      TEST_REGULAR(-0x1p1023L, -0x3p-1074L, -0x1p-1073L);
-      TEST_REGULAR(-0x1p1023L, 0x3p-1073L, -0x1p-1073L);
-      TEST_REGULAR(-0x1p1023L, -0x3p-1073L, -0x1p-1073L);
-      TEST_REGULAR(-0x1p1023L, 0x3p-1022L, -0x1p-1021L);
-      TEST_REGULAR(-0x1p1023L, -0x3p-1022L, -0x1p-1021L);
-    }
+    if constexpr (sizeof(T) < sizeof(double))
+      return; // Cases below use double's extreme exponents.
+    TEST_REGULAR(0x1p1023L, 0x3p-1074L, 0x1p-1073L);
+    TEST_REGULAR(0x1p1023L, -0x3p-1074L, 0x1p-1073L);
+    TEST_REGULAR(0x1p1023L, 0x3p-1073L, 0x1p-1073L);
+    TEST_REGULAR(0x1p1023L, -0x3p-1073L, 0x1p-1073L);
+    TEST_REGULAR(0x1p1023L, 0x3p-1022L, 0x1p-1021L);
+    TEST_REGULAR(0x1p1023L, -0x3p-1022L, 0x1p-1021L);
+    TEST_REGULAR(-0x1p1023L, 0x3p-1074L, -0x1p-1073L);
+    TEST_REGULAR(-0x1p1023L, -0x3p-1074L, -0x1p-1073L);
+    TEST_REGULAR(-0x1p1023L, 0x3p-1073L, -0x1p-1073L);
+    TEST_REGULAR(-0x1p1023L, -0x3p-1073L, -0x1p-1073L);
+    TEST_REGULAR(-0x1p1023L, 0x3p-1022L, -0x1p-1021L);
+    TEST_REGULAR(-0x1p1023L, -0x3p-1022L, -0x1p-1021L);
   }
 };
 
diff --git a/libc/test/src/math/smoke/fmodf16_test.cpp b/libc/test/src/math/smoke/fmodf16_test.cpp
new file mode 100644
index 0000000000000..9a48c5aa0d609
--- /dev/null
+++ b/libc/test/src/math/smoke/fmodf16_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmodf16 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmodf16.h"
+
+LIST_FMOD_TESTS(float16, LIBC_NAMESPACE::fmodf16)

From f6621ffd0a175965caa7764103796d83fab3fff2 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Thu, 6 Jun 2024 19:58:53 +0200
Subject: [PATCH 2/7] [libc][math][c23] Change fmodf16 to use generic FMod with
 uint32_t

---
 libc/src/math/generic/fmodf16.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/math/generic/fmodf16.cpp b/libc/src/math/generic/fmodf16.cpp
index 0a54a65806de9..a5bfd78113f63 100644
--- a/libc/src/math/generic/fmodf16.cpp
+++ b/libc/src/math/generic/fmodf16.cpp
@@ -13,7 +13,7 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float16, fmodf16, (float16 x, float16 y)) {
-  return fputil::generic::FMod<float16>::eval(x, y);
+  return fputil::generic::FMod<float16, uint32_t>::eval(x, y);
 }
 
 } // namespace LIBC_NAMESPACE

From 002ebebef39e0e7a592be36cdfd17275d7321efa Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Fri, 7 Jun 2024 23:02:44 +0200
Subject: [PATCH 3/7] [libc][math][c23] Add performance test for different
 implementations of fmodf16

---
 .../BinaryOpSingleOutputPerf.h                | 31 ++++++++++---------
 .../math/performance_testing/CMakeLists.txt   | 12 +++++++
 .../math/performance_testing/fmodf16_perf.cpp | 24 ++++++++++++++
 3 files changed, 53 insertions(+), 14 deletions(-)
 create mode 100644 libc/test/src/math/performance_testing/fmodf16_perf.cpp

diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 504d1be94b891..3a469b2f6c44d 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -33,13 +33,15 @@ template <typename T> class BinaryOpSingleOutputPerf {
       }
 
       StorageType step = (endingBit - startingBit) / N;
-      for (StorageType bitsX = startingBit, bitsY = endingBit;;
-           bitsX += step, bitsY -= step) {
-        T x = FPBits(bitsX).get_val();
-        T y = FPBits(bitsY).get_val();
-        result = func(x, y);
-        if (endingBit - bitsX < step) {
-          break;
+      for (int i = 0; i < 5000; i++) {
+        for (StorageType bitsX = startingBit, bitsY = endingBit;;
+             bitsX += step, bitsY -= step) {
+          T x = FPBits(bitsX).get_val();
+          T y = FPBits(bitsY).get_val();
+          result = func(x, y);
+          if (endingBit - bitsX < step) {
+            break;
+          }
         }
       }
     };
@@ -49,7 +51,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
     runner(myFunc);
     timer.stop();
 
-    double my_average = static_cast<double>(timer.nanoseconds()) / N;
+    double my_average = static_cast<double>(timer.nanoseconds()) / (N * 5000);
     log << "-- My function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << my_average << " ns/op \n";
@@ -60,7 +62,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
     runner(otherFunc);
     timer.stop();
 
-    double other_average = static_cast<double>(timer.nanoseconds()) / N;
+    double other_average = static_cast<double>(timer.nanoseconds()) / (N * 5000);
     log << "-- Other function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << other_average << " ns/op \n";
@@ -76,17 +78,17 @@ template <typename T> class BinaryOpSingleOutputPerf {
     log << " Performance tests with inputs in denormal range:\n";
     run_perf_in_range(myFunc, otherFunc, /* startingBit= */ StorageType(0),
                       /* endingBit= */ FPBits::max_subnormal().uintval(),
-                      10'000'001, log);
+                      FPBits::max_subnormal().uintval(), log);
     log << "\n Performance tests with inputs in normal range:\n";
     run_perf_in_range(myFunc, otherFunc,
                       /* startingBit= */ FPBits::min_normal().uintval(),
                       /* endingBit= */ FPBits::max_normal().uintval(),
-                      10'000'001, log);
+                      FPBits::max_normal().uintval() - FPBits::min_normal().uintval(), log);
     log << "\n Performance tests with inputs in normal range with exponents "
            "close to each other:\n";
     run_perf_in_range(
         myFunc, otherFunc, /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(),
-        /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), 1'001'001, log);
+        /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), FPBits(T(0x1.0p+10)).uintval() - FPBits(T(0x1.0p-10)).uintval(), log);
   }
 
   static void run_diff(Func myFunc, Func otherFunc, const char *logFile) {
@@ -115,8 +117,9 @@ template <typename T> class BinaryOpSingleOutputPerf {
 } // namespace LIBC_NAMESPACE
 
 #define BINARY_OP_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename)           \
-  int main() {                                                                 \
+  {                                                                 \
+    LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf<T>::run_perf(            \
+        &myFunc, &otherFunc, filename);                                        \
     LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf<T>::run_perf(            \
         &myFunc, &otherFunc, filename);                                        \
-    return 0;                                                                  \
   }
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index d1fb24e37f728..d10f5ab848ec7 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -343,6 +343,18 @@ add_perf_binary(
     -fno-builtin
 )
 
+add_perf_binary(
+  fmodf16_perf
+  SRCS
+    fmodf16_perf.cpp
+  DEPENDS
+    .binary_op_single_output_diff
+    libc.include.llvm-libc-macros.stdint_macros
+    libc.src.math.fmodf16
+    libc.src.__support.FPUtil.generic.fmod
+    libc.src.__support.macros.properties.types
+)
+
 add_perf_binary(
   fmodf128_perf
   SRCS
diff --git a/libc/test/src/math/performance_testing/fmodf16_perf.cpp b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
new file mode 100644
index 0000000000000..5ae2e43a54931
--- /dev/null
+++ b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
@@ -0,0 +1,24 @@
+//===-- Performance test for fmodf16 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputPerf.h"
+
+//#include "include/llvm-libc-macros/stdint-macros.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/macros/properties/types.h"
+
+#define FMOD_FUNC(U) (LIBC_NAMESPACE::fputil::generic::FMod<float16, U>::eval)
+
+int main() {
+BINARY_OP_SINGLE_OUTPUT_PERF(float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint32_t),
+                             "fmodf16_u16_vs_u32_perf.log")
+
+BINARY_OP_SINGLE_OUTPUT_PERF(float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint64_t),
+                             "fmodf16_u16_vs_u64_perf.log")
+    return 0;
+}

From fff5340770fd8d14e7a0391a0ec8c0f5dc88050a Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Fri, 7 Jun 2024 23:38:17 +0200
Subject: [PATCH 4/7] [libc][math][c23] Clean up performance test for different
 implementations of fmodf16

---
 .../BinaryOpSingleOutputPerf.h                | 43 +++++++++++++------
 .../math/performance_testing/CMakeLists.txt   |  4 +-
 .../math/performance_testing/fmodf16_perf.cpp | 15 ++++---
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 3a469b2f6c44d..cebd159b634c3 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -6,9 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/algorithm.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/src/math/performance_testing/Timer.h"
 
+#include <cstddef>
 #include <fstream>
 
 namespace LIBC_NAMESPACE {
@@ -25,7 +27,12 @@ template <typename T> class BinaryOpSingleOutputPerf {
 
   static void run_perf_in_range(Func myFunc, Func otherFunc,
                                 StorageType startingBit, StorageType endingBit,
-                                StorageType N, std::ofstream &log) {
+                                size_t N, size_t rounds, std::ofstream &log) {
+    if (endingBit - startingBit < N)
+      N = endingBit - startingBit;
+
+    size_t total_ops = N * rounds;
+
     auto runner = [=](Func func) {
       volatile T result;
       if (endingBit < startingBit) {
@@ -33,7 +40,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
       }
 
       StorageType step = (endingBit - startingBit) / N;
-      for (int i = 0; i < 5000; i++) {
+      for (size_t i = 0; i < rounds; i++) {
         for (StorageType bitsX = startingBit, bitsY = endingBit;;
              bitsX += step, bitsY -= step) {
           T x = FPBits(bitsX).get_val();
@@ -51,7 +58,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
     runner(myFunc);
     timer.stop();
 
-    double my_average = static_cast<double>(timer.nanoseconds()) / (N * 5000);
+    double my_average = static_cast<double>(timer.nanoseconds()) / total_ops;
     log << "-- My function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << my_average << " ns/op \n";
@@ -62,7 +69,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
     runner(otherFunc);
     timer.stop();
 
-    double other_average = static_cast<double>(timer.nanoseconds()) / (N * 5000);
+    double other_average = static_cast<double>(timer.nanoseconds()) / total_ops;
     log << "-- Other function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << other_average << " ns/op \n";
@@ -73,22 +80,24 @@ template <typename T> class BinaryOpSingleOutputPerf {
     log << "     Mine / Other's  : " << my_average / other_average << " \n";
   }
 
-  static void run_perf(Func myFunc, Func otherFunc, const char *logFile) {
+  static void run_perf(Func myFunc, Func otherFunc, int rounds,
+                       const char *logFile) {
     std::ofstream log(logFile);
     log << " Performance tests with inputs in denormal range:\n";
     run_perf_in_range(myFunc, otherFunc, /* startingBit= */ StorageType(0),
                       /* endingBit= */ FPBits::max_subnormal().uintval(),
-                      FPBits::max_subnormal().uintval(), log);
+                      10'000'001, rounds, log);
     log << "\n Performance tests with inputs in normal range:\n";
     run_perf_in_range(myFunc, otherFunc,
                       /* startingBit= */ FPBits::min_normal().uintval(),
                       /* endingBit= */ FPBits::max_normal().uintval(),
-                      FPBits::max_normal().uintval() - FPBits::min_normal().uintval(), log);
+                      10'000'001, rounds, log);
     log << "\n Performance tests with inputs in normal range with exponents "
            "close to each other:\n";
-    run_perf_in_range(
-        myFunc, otherFunc, /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(),
-        /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), FPBits(T(0x1.0p+10)).uintval() - FPBits(T(0x1.0p-10)).uintval(), log);
+    run_perf_in_range(myFunc, otherFunc,
+                      /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(),
+                      /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(),
+                      1'001'001, rounds, log);
   }
 
   static void run_diff(Func myFunc, Func otherFunc, const char *logFile) {
@@ -117,9 +126,17 @@ template <typename T> class BinaryOpSingleOutputPerf {
 } // namespace LIBC_NAMESPACE
 
 #define BINARY_OP_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename)           \
-  {                                                                 \
+  int main() {                                                                 \
+    LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf<T>::run_perf(            \
+        &myFunc, &otherFunc, 1, filename);                                     \
+    return 0;                                                                  \
+  }
+
+#define BINARY_OP_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds,          \
+                                        filename)                              \
+  {                                                                            \
     LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf<T>::run_perf(            \
-        &myFunc, &otherFunc, filename);                                        \
+        &myFunc, &otherFunc, rounds, filename);                                \
     LIBC_NAMESPACE::testing::BinaryOpSingleOutputPerf<T>::run_perf(            \
-        &myFunc, &otherFunc, filename);                                        \
+        &myFunc, &otherFunc, rounds, filename);                                \
   }
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index d10f5ab848ec7..10522d972fb2d 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -88,6 +88,9 @@ add_header_library(
   binary_op_single_output_diff
   HDRS
     BinaryOpSingleOutputPerf.h
+  DEPENDS
+    libc.src.__support.CPP.algorithm
+    libc.src.__support.FPUtil.fp_bits
 )
 
 add_perf_binary(
@@ -349,7 +352,6 @@ add_perf_binary(
     fmodf16_perf.cpp
   DEPENDS
     .binary_op_single_output_diff
-    libc.include.llvm-libc-macros.stdint_macros
     libc.src.math.fmodf16
     libc.src.__support.FPUtil.generic.fmod
     libc.src.__support.macros.properties.types
diff --git a/libc/test/src/math/performance_testing/fmodf16_perf.cpp b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
index 5ae2e43a54931..35bb0a15dfbab 100644
--- a/libc/test/src/math/performance_testing/fmodf16_perf.cpp
+++ b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
@@ -8,17 +8,20 @@
 
 #include "BinaryOpSingleOutputPerf.h"
 
-//#include "include/llvm-libc-macros/stdint-macros.h"
 #include "src/__support/FPUtil/generic/FMod.h"
 #include "src/__support/macros/properties/types.h"
 
+#include <stdint.h>
+
 #define FMOD_FUNC(U) (LIBC_NAMESPACE::fputil::generic::FMod<float16, U>::eval)
 
 int main() {
-BINARY_OP_SINGLE_OUTPUT_PERF(float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint32_t),
-                             "fmodf16_u16_vs_u32_perf.log")
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, FMOD_FUNC(uint16_t),
+                                  FMOD_FUNC(uint32_t), 5000,
+                                  "fmodf16_u16_vs_u32_perf.log")
 
-BINARY_OP_SINGLE_OUTPUT_PERF(float16, FMOD_FUNC(uint16_t), FMOD_FUNC(uint64_t),
-                             "fmodf16_u16_vs_u64_perf.log")
-    return 0;
+  BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, FMOD_FUNC(uint16_t),
+                                  FMOD_FUNC(uint64_t), 5000,
+                                  "fmodf16_u16_vs_u64_perf.log")
+  return 0;
 }

From 21b7fe299c43fe320cb61c6e76061149f87bf945 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Fri, 7 Jun 2024 23:41:03 +0200
Subject: [PATCH 5/7] [libc][math][c23] Remove unused dependency from
 performance test

---
 .../test/src/math/performance_testing/BinaryOpSingleOutputPerf.h | 1 -
 libc/test/src/math/performance_testing/CMakeLists.txt            | 1 -
 2 files changed, 2 deletions(-)

diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index cebd159b634c3..4c04b5b329fe6 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/CPP/algorithm.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/src/math/performance_testing/Timer.h"
 
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index 10522d972fb2d..4ea78f9999e4d 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -89,7 +89,6 @@ add_header_library(
   HDRS
     BinaryOpSingleOutputPerf.h
   DEPENDS
-    libc.src.__support.CPP.algorithm
     libc.src.__support.FPUtil.fp_bits
 )
 

From bc2bc7608bc4a33df4bfd860f56b85e1bd72c178 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Sat, 8 Jun 2024 00:17:55 +0200
Subject: [PATCH 6/7] [libc][math][c23] Clean up performance test again

---
 .../performance_testing/BinaryOpSingleOutputPerf.h   | 12 +++++-------
 .../src/math/performance_testing/fmodf16_perf.cpp    |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 4c04b5b329fe6..861840e87f7e7 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -30,8 +30,6 @@ template <typename T> class BinaryOpSingleOutputPerf {
     if (endingBit - startingBit < N)
       N = endingBit - startingBit;
 
-    size_t total_ops = N * rounds;
-
     auto runner = [=](Func func) {
       volatile T result;
       if (endingBit < startingBit) {
@@ -57,7 +55,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
     runner(myFunc);
     timer.stop();
 
-    double my_average = static_cast<double>(timer.nanoseconds()) / total_ops;
+    double my_average = static_cast<double>(timer.nanoseconds()) / N / rounds;
     log << "-- My function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << my_average << " ns/op \n";
@@ -68,7 +66,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
     runner(otherFunc);
     timer.stop();
 
-    double other_average = static_cast<double>(timer.nanoseconds()) / total_ops;
+    double other_average = static_cast<double>(timer.nanoseconds()) / N / rounds;
     log << "-- Other function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << other_average << " ns/op \n";
@@ -85,18 +83,18 @@ template <typename T> class BinaryOpSingleOutputPerf {
     log << " Performance tests with inputs in denormal range:\n";
     run_perf_in_range(myFunc, otherFunc, /* startingBit= */ StorageType(0),
                       /* endingBit= */ FPBits::max_subnormal().uintval(),
-                      10'000'001, rounds, log);
+                      1'000'001, rounds, log);
     log << "\n Performance tests with inputs in normal range:\n";
     run_perf_in_range(myFunc, otherFunc,
                       /* startingBit= */ FPBits::min_normal().uintval(),
                       /* endingBit= */ FPBits::max_normal().uintval(),
-                      10'000'001, rounds, log);
+                      1'000'001, rounds, log);
     log << "\n Performance tests with inputs in normal range with exponents "
            "close to each other:\n";
     run_perf_in_range(myFunc, otherFunc,
                       /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(),
                       /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(),
-                      1'001'001, rounds, log);
+                      1'000'001, rounds, log);
   }
 
   static void run_diff(Func myFunc, Func otherFunc, const char *logFile) {
diff --git a/libc/test/src/math/performance_testing/fmodf16_perf.cpp b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
index 35bb0a15dfbab..ff01fa6ca5870 100644
--- a/libc/test/src/math/performance_testing/fmodf16_perf.cpp
+++ b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
@@ -1,4 +1,4 @@
-//===-- Differential test for fmodf16 -------------------------------------===//
+//===-- Performance test for fmodf16 --------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From 324607f02ad209285085d714341ca3f99ceffe00 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Sat, 8 Jun 2024 00:22:11 +0200
Subject: [PATCH 7/7] [libc][math][c23] Wrap overlong line in performance test

---
 .../src/math/performance_testing/BinaryOpSingleOutputPerf.h    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 861840e87f7e7..3027932c70f40 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -66,7 +66,8 @@ template <typename T> class BinaryOpSingleOutputPerf {
     runner(otherFunc);
     timer.stop();
 
-    double other_average = static_cast<double>(timer.nanoseconds()) / N / rounds;
+    double other_average =
+        static_cast<double>(timer.nanoseconds()) / N / rounds;
     log << "-- Other function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << other_average << " ns/op \n";