From dd6b8408c975dcf72fd682ea5e80d5ad66075b60 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Fri, 21 Feb 2025 13:14:32 -0500
Subject: [PATCH 01/12] ggml-quants : improve IQ4_NL, IQ4_XS, and Q3_K

---
 ggml/src/ggml-quants.c | 400 ++++++++++++++++++++++++++++++++---------
 1 file changed, 320 insertions(+), 80 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7918388ae9f2e..638766ae89bcf 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -628,6 +628,258 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
     return scale;
 }
 
+struct fraction {
+    // float frac;
+    float numer;
+    float denom;
+    int i;
+};
+
+// comparator function for sorting fractions in make_qkxs_quants
+static inline int compare_fractions_desc(const void * a, const void * b) {
+    const struct fraction * f_a = (const struct fraction *) a;
+    const struct fraction * f_b = (const struct fraction *) b;
+    float na = f_a->numer;
+    float da = f_a->denom;
+    float nb = f_b->numer;
+    float db = f_b->denom;
+
+    // Stable sort
+    // a - b sorts ascending, which means
+    // 1 swaps, -1 stays
+    if (da == db) { // equal denominators
+        return (na == nb) ? ((a > b) ? 1 : -1) : (na < nb) ? 1 : -1;
+    }
+    if (na == nb) { // equal numerators
+        return (da > db) ? 1 : -1;
+    }
+    float ab = na * db;
+    float ba = nb * da;
+    return (ab == ba) ? ((a > b) ? 1 : -1) : (ab < ba) ? 1 : -1;
+}
+
+// exhaustive search with cumulative sums
+// Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions
+static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, struct fraction * restrict Faux, bool signed_scale) {
+    float max = 0.0f;
+    float amax = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) {
+            amax = ax;
+            max = x[i];
+        }
+    }
+    bool negative_scale = false;
+    if (signed_scale && -nmin != nmax) {
+        // the max side should have the biggest range
+        if ((max < 0.0f) == (-nmin < nmax)) {
+            // [-4, 3] ==> [-3, 4]
+            int tmp = nmin;
+            nmin = -nmax;
+            nmax = -tmp;
+            negative_scale = true;
+        }
+    }
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
+        }
+        return 0.0f;
+    }
+    int n_frac = 0;
+    for (int i = 0; i < n; ++i) {
+        // assuming nmin <= nmax
+        const int odd_max = MAX(0, x[i] < 0 ? -nmin : nmax);
+        const int odd_min = MAX(0, x[i] < 0 ? -nmax : nmin);
+        const float v = fabsf(x[i]);
+        // fprintf(stderr, "%s: i=%d, odd_min=%d, odd_max=%d\n", __func__, i, odd_min, odd_max);
+        for (int j = odd_min; j < odd_max; ++j) {
+            const float odd = 2*j + 1;
+            Faux[n_frac++] = (struct fraction){
+                .numer=v,
+                .denom=odd,
+                .i=i,
+            };
+        }
+    }
+
+    qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc);
+
+    float iscale = 0.0f;
+    {
+        float sumlx = 0.0f;
+        float suml2 = 0.0f;
+        float best = 0.0f;
+        float best_denom = 1.0f;
+        for (int i = 0; i < n_frac; ++i) {
+            // maximize the weighted cosine
+            const int ii = Faux[i].i;
+            const float w = weights ? weights[ii] : x[ii] * x[ii];
+            sumlx += w * Faux[i].numer;
+            suml2 += w * Faux[i].denom;
+            const float current = sumlx * sumlx;
+            // fprintf(stderr, "%s: Faux[%d]=(%f/%f) * %f, square(sumlx)=%f, suml2=%f, k*cos2=%f\n", __func__, i, Faux[i].numer, Faux[i].denom, Faux[i].weight, current, suml2, current / suml2);
+            // use the last in case of equality
+            // FIXME: > or >= ?? Why does [0, 0, 1] rounds to [0, 0, 0] with >= ?
+            if (suml2 > 0.0f && current * best_denom > best * suml2) {
+                best = current;
+                best_denom = suml2;
+                iscale = Faux[i].numer > 0.0f ? Faux[i].denom / (2.0f * Faux[i].numer) : 0.0f;
+                if (!isfinite(iscale)) {
+                    fprintf(stderr, "%s: iscale is not finite, %f/(2*%f)\n", __func__, Faux[i].denom, Faux[i].numer);
+                }
+            }
+        }
+    }
+    // (very) small fudging necessary because floats otherwise round to nearest even
+    iscale = iscale * ((float)((1 << 23) + 1) / (float)(1 << 23));
+
+    float sumlx = 0.0f;
+    float suml2 = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        // Rounding away from zero is assumed by the search algorithm above.
+        int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
+        if (negative_scale) {
+            l = -l;
+        }
+        L[i] = negative_scale ? l + nmax : l - nmin;
+        float w = weights ? weights[i] : x[i] * x[i];
+        // weighted projection scale
+        sumlx += w * x[i] * l;
+        suml2 += w * l * l;
+    }
+
+    return suml2 > 0.0f ? sumlx / suml2 : 0.0f;
+}
+
+// non-linear exhaustive search with cumulative sums
+// Need Faux to have room for n*k fractions
+static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const float * restrict weights, const int8_t * restrict kvalues, uint8_t * restrict L, uint8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) {
+    float sumlx = 0.0f;
+    float suml2 = 0.0f;
+    int kmin = abs(kvalues[0]);
+    int koff = 0;
+    for (int i = 1; i < k; ++i) {
+        int ak = abs(kvalues[i]);
+        if (ak < kmin) {
+            kmin = ak;
+            koff = i;
+        }
+    }
+    kmin = kvalues[koff];
+    for (int i = 0; i < n; ++i) {
+        float w = weights ? weights[i] : x[i] * x[i];
+        Laux[i] = koff;
+        sumlx += w * x[i] * kmin;
+        suml2 += w * kmin * kmin;
+    }
+
+    int n_frac_p = 0;
+    for (int i = 0; i < n; ++i) {
+        const int start = x[i] < 0.0f ? 1 : koff + 1;
+        const int end = x[i] < 0.0f ? koff + 1: k;
+        for (int j = start; j < end; ++j) {
+            const float threshold = kvalues[j] + kvalues[j - 1];
+            const float step = kvalues[j] - kvalues[j - 1];
+            Faux[n_frac_p++] = (struct fraction){
+                // This should always be positive or else
+                // the fraction comparison function won't work properly
+                .numer=fabsf(x[i] * step),
+                // It's amazing how this is still the difference of consecutive squares
+                .denom=fabsf(threshold * step),
+                .i=i,
+            };
+        }
+    }
+
+    qsort(Faux, n_frac_p, sizeof(struct fraction), compare_fractions_desc);
+
+    float best = 0.0f;
+    float best_sumlx = 0.0f;
+    float best_suml2 = 1.0f;
+    float sumlx_p = sumlx;
+    float suml2_p = suml2;
+    int best_p_i = -2; // not consecutive with 0..n_frac
+    for (int i = 0; i < n_frac_p; ++i) {
+        const int ii = Faux[i].i;
+        const float w = weights ? weights[ii] : x[ii] * x[ii];
+        sumlx_p += w * Faux[i].numer;
+        suml2_p += w * Faux[i].denom;
+        const float current = sumlx_p * sumlx_p;
+        Laux[ii] += x[ii] < 0.0f ? -1 : 1;
+        if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) {
+            best = current;
+            best_sumlx = sumlx_p;
+            best_suml2 = suml2_p;
+            if (i == best_p_i + 1) {
+                // reduce copies for consecutive bests
+                L[ii] += x[ii] < 0.0f ? -1 : 1;
+            } else {
+                for (int j = 0; j < n; ++j) {
+                    L[j] = Laux[j];
+                }
+            }
+            best_p_i = i;
+        }
+    }
+
+    // Non-linear mappings are usually not symmetric, so try negating the scale
+    if (signed_scale) {
+        for (int i = 0; i < n; ++i) {
+            Laux[i] = koff;
+        }
+
+        int n_frac_n = 0;
+        for (int i = 0; i < n; ++i) {
+            const int start = x[i] >= 0.0f ? 1 : koff + 1;
+            const int end = x[i] >= 0.0f ? koff + 1: k;
+            for (int j = start; j < end; ++j) {
+                const float threshold = kvalues[j] + kvalues[j - 1];
+                const float step = kvalues[j] - kvalues[j - 1];
+                Faux[n_frac_n++] = (struct fraction){
+                    // This should always be positive or else
+                    // the fraction comparison function won't work properly
+                    .numer=fabsf(x[i] * step),
+                    // It's amazing how this is still the difference of consecutive squares
+                    .denom=fabsf(threshold * step),
+                    .i=i,
+                };
+            }
+        }
+
+        qsort(Faux, n_frac_n, sizeof(struct fraction), compare_fractions_desc);
+
+        float sumlx_n = -sumlx;
+        float suml2_n = suml2;
+        int best_n_i = -2; // not consecutive with 0..n_frac
+        for (int i = 0; i < n_frac_n; ++i) {
+            const int ii = Faux[i].i;
+            const float w = weights ? weights[ii] : x[ii] * x[ii];
+            sumlx_n += w * Faux[i].numer;
+            suml2_n += w * Faux[i].denom;
+            const float current = sumlx_n * sumlx_n;
+            Laux[ii] += x[ii] >= 0.0f ? -1 : 1;
+            if (suml2_n > 0.0f && current * best_suml2 > best * suml2_n) {
+                best = current;
+                best_sumlx = -sumlx_n;
+                best_suml2 = suml2_n;
+                if (i == best_n_i + 1) {
+                    // reduce copies for consecutive bests
+                    L[ii] += x[ii] >= 0.0f ? -1 : 1;
+                } else {
+                    for (int j = 0; j < n; ++j) {
+                        L[j] = Laux[j];
+                    }
+                }
+                best_n_i = i;
+            }
+        }
+    }
+
+    return best_suml2 != 0.0f ? best_sumlx / best_suml2 : 0.0f;
+}
+
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
     if (j < 4) {
         *d = q[j] & 63; *m = q[j + 4] & 63;
@@ -982,14 +1234,21 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
     const int nb = k / QK_K;
 
     int8_t L[QK_K];
+    struct fraction Faux[16 * 4];
     float scales[QK_K / 16];
+    float weights[16];
+
+    for (int i = 0; i < 16; ++i) {
+        weights[i] = 1.0f;
+    }
 
     for (int i = 0; i < nb; i++) {
 
         float max_scale = 0;
         float amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
+            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Faux, true);
+            // scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
             float scale = fabsf(scales[j]);
             if (scale > amax) {
                 amax = scale; max_scale = scales[j];
@@ -1015,20 +1274,20 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
             y[i].d = GGML_FP32_TO_FP16(0.f);
         }
 
-        int8_t sc;
-        for (int j = 0; j < QK_K/16; ++j) {
-            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
-            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
-                L[16*j + ii] = l + 4;
-            }
-        }
+        // int8_t sc;
+        // for (int j = 0; j < QK_K/16; ++j) {
+        //     sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
+        //     sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
+        //     float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+        //     if (!d) {
+        //         continue;
+        //     }
+        //     for (int ii = 0; ii < 16; ++ii) {
+        //         int l = nearest_int(x[16*j + ii]/d);
+        //         l = MAX(-4, MIN(3, l));
+        //         L[16*j + ii] = l + 4;
+        //     }
+        // }
 
         memset(y[i].hmask, 0, QK_K/8);
         // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
@@ -1112,6 +1371,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
     float weight[16];
     float sw[QK_K / 16];
     int8_t Ls[QK_K / 16];
+    struct fraction Faux[16 * 32];
 
     for (int i = 0; i < nb; i++) {
 
@@ -1130,13 +1390,15 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
             for (int l = 0; l < 16; ++l) sumw += weight[l];
             sw[j] = sumw;
 
-            scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
+            // scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
+            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Faux, true);
 
         }
 
         memset(y[i].scales, 0, 12);
 
-        float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
+        // float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
+        float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Faux, true);
         for (int j = 0; j < QK_K/16; ++j) {
             int l = Ls[j];
             if (j < 8) {
@@ -1149,20 +1411,20 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
         }
         y[i].d = GGML_FP32_TO_FP16(d_block);
 
-        int8_t sc;
-        for (int j = 0; j < QK_K/16; ++j) {
-            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
-            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
-                L[16*j + ii] = l + 4;
-            }
-        }
+        // int8_t sc;
+        // for (int j = 0; j < QK_K/16; ++j) {
+        //     sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
+        //     sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
+        //     float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+        //     if (!d) {
+        //         continue;
+        //     }
+        //     for (int ii = 0; ii < 16; ++ii) {
+        //         int l = nearest_int(x[16*j + ii]/d);
+        //         l = MAX(-4, MIN(3, l));
+        //         L[16*j + ii] = l + 4;
+        //     }
+        // }
 
         memset(y[i].hmask, 0, QK_K/8);
         // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
@@ -4572,10 +4834,9 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
 
 static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
         ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
-        float * scales, float * weight, uint8_t * L,
+        float * scales, float * weight, uint8_t * L, uint8_t * Laux, struct fraction * Faux,
         const int8_t * values,
-        const float * quant_weights,
-        const int ntry) {
+        const float * quant_weights) {
 
     float sigma2 = 0;
     for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
@@ -4592,7 +4853,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             const float * qw = quant_weights + ib*block_size;
             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
         } else {
-            for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
+            for (int j = 0; j < block_size; ++j) weight[j] = sqrtf(sigma2 + xb[j]*xb[j]);
+            // for (int j = 0; j < block_size; ++j) weight[j] = 1;
         }
         float amax = 0, max = 0;
         for (int j = 0; j < block_size; ++j) {
@@ -4605,35 +4867,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             scales[ib] = 0;
             continue;
         }
-        float d = ntry > 0 ? -max/values[0] : max/values[0];
-        float id = 1/d;
-        float sumqx = 0, sumq2 = 0;
-        for (int j = 0; j < block_size; ++j) {
-            float al = id*xb[j];
-            int l = best_index_int8(16, values, al);
-            Lb[j] = l;
-            float q = values[l];
-            float w = weight[j];
-            sumqx += w*q*xb[j];
-            sumq2 += w*q*q;
-        }
-        d = sumqx/sumq2;
-        float best = d*sumqx;
-        for (int itry = -ntry; itry <= ntry; ++itry) {
-            id = (itry + values[0])/max;
-            sumqx = sumq2 = 0;
-            for (int j = 0; j < block_size; ++j) {
-                float al = id*xb[j];
-                int l = best_index_int8(16, values, al);
-                float q = values[l];
-                float w = weight[j];
-                sumqx += w*q*xb[j];
-                sumq2 += w*q*q;
-            }
-            if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
-                d = sumqx/sumq2; best = d * sumqx;
-            }
-        }
+        float d = make_qkxs_nl_quants(block_size, 16, xb, weight, values, Lb, Laux, Faux, true);
         scales[ib] = d;
         float abs_d = fabsf(d);
         if (abs_d > amax_scale) {
@@ -4650,13 +4884,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         for (int ib = 0; ib < super_block_size/block_size; ++ib) {
             int l = nearest_int(id*scales[ib]);
             l = MAX(-32, MIN(31, l));
-            float dl = d * l;
-            float idl = dl ? 1/dl : 0.f;
-            uint8_t * Lb = L + ib*block_size;
-            const float * xb = x + ib*block_size;
-            for (int j = 0; j < block_size; ++j) {
-                Lb[j] = best_index_int8(16, values, idl*xb[j]);
-            }
+            // float dl = d * l;
+            // float idl = dl ? 1/dl : 0.f;
+            // uint8_t * Lb = L + ib*block_size;
+            // const float * xb = x + ib*block_size;
+            // for (int j = 0; j < block_size; ++j) {
+            //     Lb[j] = best_index_int8(16, values, idl*xb[j]);
+            // }
             l += 32;
             uint8_t l_l = l & 0xf;
             uint8_t l_h = l >>  4;
@@ -4666,12 +4900,12 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         }
     } else {
         dh[0] = GGML_FP32_TO_FP16(scales[0]);
-        if (ntry > 0) {
-            float id = scales[0] ? 1/scales[0] : 0;
-            for (int j = 0; j < super_block_size; ++j) {
-                L[j] = best_index_int8(16, values, id*x[j]);
-            }
-        }
+        // if (ntry > 0) {
+        //     float id = scales[0] ? 1/scales[0] : 0;
+        //     for (int j = 0; j < super_block_size; ++j) {
+        //         L[j] = best_index_int8(16, values, id*x[j]);
+        //     }
+        // }
     }
 
     for (int i = 0; i < super_block_size/32; ++i) {
@@ -4686,6 +4920,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
     int64_t nblock = n_per_row/QK4_NL;
     char * qrow = (char *)dst;
     uint8_t L[QK4_NL];
+    uint8_t Laux[QK4_NL];
+    struct fraction Faux[QK4_NL * 16];
     float weight[QK4_NL];
     uint16_t unused_h;
     uint8_t * unused_l = NULL;
@@ -4695,7 +4931,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
             quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                    &scale, weight, L, kvalues_iq4nl, qw, 7);
+                    &scale, weight, L, Laux, Faux, kvalues_iq4nl, qw);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_nl);
@@ -4708,6 +4944,8 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
     GGML_ASSERT(k%QK4_NL == 0);
     int64_t nblock = k/QK4_NL;
     uint8_t L[QK4_NL];
+    uint8_t Laux[QK4_NL];
+    struct fraction Faux[QK4_NL * 16];
     float weight[QK4_NL];
     uint16_t unused_h;
     uint8_t * unused_l = NULL;
@@ -4715,7 +4953,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
     block_iq4_nl * iq4 = y;
     for (int ibl = 0; ibl < nblock; ++ibl) {
         quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                &scale, weight, L, kvalues_iq4nl, NULL, -1);
+                &scale, weight, L, Laux, Faux, kvalues_iq4nl, NULL);
     }
 }
 
@@ -4724,6 +4962,8 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
     int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
     uint8_t L[QK_K];
+    uint8_t Laux[32];
+    struct fraction Faux[32 * 16];
     float weight[32];
     float scales[QK_K/32];
     for (int64_t row = 0; row < nrow; ++row) {
@@ -4731,7 +4971,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
             quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
-                    scales, weight, L, kvalues_iq4nl, qw, 7);
+                    scales, weight, L, Laux, Faux, kvalues_iq4nl, qw);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_xs);

From d0060fc498595c7a2b7c1e28662cc11f298e6c9a Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Fri, 21 Feb 2025 15:05:03 -0500
Subject: [PATCH 02/12] ggml-quants : better and faster make_qkxs_quants

---
 ggml/src/ggml-quants.c | 179 ++++++++++++++++++++++++++---------------
 1 file changed, 116 insertions(+), 63 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 638766ae89bcf..29ffb608b645f 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -660,97 +660,148 @@ static inline int compare_fractions_desc(const void * a, const void * b) {
 
 // exhaustive search with cumulative sums
 // Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions
-static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, struct fraction * restrict Faux, bool signed_scale) {
-    float max = 0.0f;
-    float amax = 0.0f;
-    for (int i = 0; i < n; ++i) {
-        float ax = fabsf(x[i]);
-        if (ax > amax) {
-            amax = ax;
-            max = x[i];
+static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) {
+    const int orig_nmin = nmin;
+    const int orig_nmax = nmax;
+    float max = x[0];
+    float min = x[0];
+    float w_amax = weights[0] * fabsf(x[0]);
+    int max_i = 0;
+    int w_amax_i = 0;
+    int min_i = 0;
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) { min = x[i]; min_i = i; }
+        if (x[i] > max) { max = x[i]; max_i = i; }
+        // Find the most important value
+        const float w = weights[i];
+        const float wax = w * fabsf(x[i]);
+        if (wax > w_amax) {
+            w_amax = wax;
+            w_amax_i = i;
+        }
+    }
+    const int amax_i = fabsf(min) > fabsf(max) ? min_i : max_i;
+    const float amax = fabsf(x[amax_i]);
+
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
         }
+        return 0.0f;
     }
     bool negative_scale = false;
     if (signed_scale && -nmin != nmax) {
         // the max side should have the biggest range
-        if ((max < 0.0f) == (-nmin < nmax)) {
+        // FIXME: this is incorrect when the weights[.] do not sort in the same order as fabsf(x[.])
+        //        or is it some other condition?
+        if ((x[amax_i] < 0.0f) == (-nmin < nmax)) {
             // [-4, 3] ==> [-3, 4]
-            int tmp = nmin;
+            const int tmp = nmin;
+            const float ftmp = min;
             nmin = -nmax;
             nmax = -tmp;
+            min = -max;
+            max = -ftmp;
             negative_scale = true;
         }
     }
-    if (amax < GROUP_MAX_EPS) { // all zero
+
+    // Find the max range in [0, amax_range] which doesn't result in clamping.
+    // This is the range from the side which would clamp first (biggest ratio of max to nmax).
+    int amax_range;
+    float range_max;
+    if (fabsf(-max * nmin) < fabsf(-min * nmax)) {
+        amax_range = MAX(0, -nmin);
+        range_max = fabsf(min);
+    } else {
+        amax_range = MAX(0, nmax);
+        range_max = fabsf(max);
+    }
+    float sumlx = 0.0f;
+    float suml2 = 0.0f;
+    float scale = 0.0f;
+    float best = 0.0f;
+    float best_denom = 1.0f;
+    if (amax_range > 1) {
+        // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value.
+        // Proof: anything smaller has a representable vector with values twice as big.
+        const float iscale = ((float)(amax_range / 2 + 1))/range_max * (negative_scale ? -1.0f : 1.0f);
+        for (int i = 0; i < n; ++i) {
+            const float w = weights[i];
+            int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
+            if (negative_scale) { l = -l; }
+            Laux[i] = l;
+            L[i] = l;
+            suml2 += w * l * l;
+            sumlx += w * l * x[i];
+        }
+        best = sumlx * sumlx;
+        best_denom = suml2; // should never be zero
+        scale = sumlx / suml2;
+    } else {
         for (int i = 0; i < n; ++i) {
+            Laux[i] = 0;
             L[i] = 0;
         }
-        return 0.0f;
     }
+
+    const int imax_range = MAX(0, (x[w_amax_i] < 0.0f) ? -nmin : nmax);
+    const int max_odd = 2*(imax_range + 1) + 1;
+    const float wmax = fabsf(x[w_amax_i]);
     int n_frac = 0;
     for (int i = 0; i < n; ++i) {
         // assuming nmin <= nmax
-        const int odd_max = MAX(0, x[i] < 0 ? -nmin : nmax);
-        const int odd_min = MAX(0, x[i] < 0 ? -nmax : nmin);
+        const int odd_max = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmin : nmax);
+        const int odd_min = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmax : nmin);
         const float v = fabsf(x[i]);
-        // fprintf(stderr, "%s: i=%d, odd_min=%d, odd_max=%d\n", __func__, i, odd_min, odd_max);
+        const float v_max_odd = v * max_odd;
         for (int j = odd_min; j < odd_max; ++j) {
             const float odd = 2*j + 1;
-            Faux[n_frac++] = (struct fraction){
-                .numer=v,
-                .denom=odd,
-                .i=i,
-            };
+            if (wmax * odd < v_max_odd) {
+                Faux[n_frac++] = (struct fraction){
+                    .numer=v,
+                    .denom=odd,
+                    .i=i,
+                };
+            } else {
+                // stop when the inverse scale would result in clamping the max (FIXME: most important) value
+                break;
+            }
         }
     }
 
     qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc);
 
-    float iscale = 0.0f;
-    {
-        float sumlx = 0.0f;
-        float suml2 = 0.0f;
-        float best = 0.0f;
-        float best_denom = 1.0f;
-        for (int i = 0; i < n_frac; ++i) {
-            // maximize the weighted cosine
-            const int ii = Faux[i].i;
-            const float w = weights ? weights[ii] : x[ii] * x[ii];
-            sumlx += w * Faux[i].numer;
-            suml2 += w * Faux[i].denom;
-            const float current = sumlx * sumlx;
-            // fprintf(stderr, "%s: Faux[%d]=(%f/%f) * %f, square(sumlx)=%f, suml2=%f, k*cos2=%f\n", __func__, i, Faux[i].numer, Faux[i].denom, Faux[i].weight, current, suml2, current / suml2);
-            // use the last in case of equality
-            // FIXME: > or >= ?? Why does [0, 0, 1] rounds to [0, 0, 0] with >= ?
-            if (suml2 > 0.0f && current * best_denom > best * suml2) {
-                best = current;
-                best_denom = suml2;
-                iscale = Faux[i].numer > 0.0f ? Faux[i].denom / (2.0f * Faux[i].numer) : 0.0f;
-                if (!isfinite(iscale)) {
-                    fprintf(stderr, "%s: iscale is not finite, %f/(2*%f)\n", __func__, Faux[i].denom, Faux[i].numer);
+    int best_p_i = -1; // consecutive with 0..n_frac
+    for (int i = 0; i < n_frac; ++i) {
+        // maximize the weighted cosine
+        const int ii = Faux[i].i;
+        const float w = weights ? weights[ii] : x[ii] * x[ii];
+        sumlx += w * Faux[i].numer;
+        suml2 += w * Faux[i].denom;
+        const float current = sumlx * sumlx;
+        Laux[ii] += x[ii] < 0.0f ? -1 : 1;
+        if (suml2 > 0.0f && Faux[i].numer > 0.0f && current * best_denom > best * suml2) {
+            best = current;
+            best_denom = suml2;
+            scale = sumlx / suml2;
+            if (i == best_p_i + 1) {
+                // reduce copies for consecutive bests
+                L[ii] += x[ii] < 0.0f ? -1 : 1;
+            } else {
+                for (int j = 0; j < n; ++j) {
+                    L[j] = Laux[j];
                 }
             }
+            best_p_i = i;
         }
     }
-    // (very) small fudging necessary because floats otherwise round to nearest even
-    iscale = iscale * ((float)((1 << 23) + 1) / (float)(1 << 23));
-
-    float sumlx = 0.0f;
-    float suml2 = 0.0f;
     for (int i = 0; i < n; ++i) {
-        // Rounding away from zero is assumed by the search algorithm above.
-        int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
-        if (negative_scale) {
-            l = -l;
-        }
-        L[i] = negative_scale ? l + nmax : l - nmin;
-        float w = weights ? weights[i] : x[i] * x[i];
-        // weighted projection scale
-        sumlx += w * x[i] * l;
-        suml2 += w * l * l;
+        L[i] = negative_scale ? (-L[i] + nmax) : (L[i] + -nmin);
+        GGML_ASSERT(L[i] >= 0 && L[i] <= nmax - nmin);
     }
 
-    return suml2 > 0.0f ? sumlx / suml2 : 0.0f;
+    return negative_scale ? -scale : scale;
 }
 
 // non-linear exhaustive search with cumulative sums
@@ -1234,6 +1285,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
     const int nb = k / QK_K;
 
     int8_t L[QK_K];
+    int8_t Laux[16];
     struct fraction Faux[16 * 4];
     float scales[QK_K / 16];
     float weights[16];
@@ -1247,7 +1299,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
         float max_scale = 0;
         float amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Faux, true);
+            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true);
             // scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
             float scale = fabsf(scales[j]);
             if (scale > amax) {
@@ -1367,6 +1419,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
     const int nb = n_per_row / QK_K;
 
     int8_t L[QK_K];
+    int8_t Laux[16];
     float scales[QK_K / 16];
     float weight[16];
     float sw[QK_K / 16];
@@ -1391,14 +1444,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
             sw[j] = sumw;
 
             // scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
-            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Faux, true);
+            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true);
 
         }
 
         memset(y[i].scales, 0, 12);
 
         // float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
-        float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Faux, true);
+        float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true);
         for (int j = 0; j < QK_K/16; ++j) {
             int l = Ls[j];
             if (j < 8) {
@@ -4856,11 +4909,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             for (int j = 0; j < block_size; ++j) weight[j] = sqrtf(sigma2 + xb[j]*xb[j]);
             // for (int j = 0; j < block_size; ++j) weight[j] = 1;
         }
-        float amax = 0, max = 0;
+        float amax = 0;
         for (int j = 0; j < block_size; ++j) {
             float ax = fabsf(xb[j]);
             if (ax > amax) {
-                amax = ax; max = xb[j];
+                amax = ax;
             }
         }
         if (amax < GROUP_MAX_EPS) {

From 6f7fe74946f24c2e3f3d2b1d9345a6882523da51 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Fri, 21 Feb 2025 18:47:09 -0500
Subject: [PATCH 03/12] ggml-quants : improve imatrix behavior for TQ1_0,
 TQ2_0, Q4_0, Q5_0

---
 ggml/src/ggml-quants.c | 142 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 132 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 29ffb608b645f..4dcca9953501c 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -661,8 +661,6 @@ static inline int compare_fractions_desc(const void * a, const void * b) {
 // exhaustive search with cumulative sums
 // Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions
 static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) {
-    const int orig_nmin = nmin;
-    const int orig_nmax = nmax;
     float max = x[0];
     float min = x[0];
     float w_amax = weights[0] * fabsf(x[0]);
@@ -2143,6 +2141,8 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
 
     float weight[QK4_0];
     int8_t L[QK4_0];
+    int8_t Laux[QK4_0];
+    struct fraction Faux[8 * QK4_0];
 
     float sum_x2 = 0;
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
@@ -2153,7 +2153,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
         const float * xb = x + QK4_0 * ib;
         const float * qw = quant_weights + QK4_0 * ib;
         for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight);
+        float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true);
         y[ib].d = GGML_FP32_TO_FP16(d);
         for (int j = 0; j < 16; ++j) {
             y[ib].qs[j] = L[j] | (L[j+16] << 4);
@@ -2231,6 +2231,8 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
 
     float weight[QK5_0];
     int8_t L[QK5_0];
+    int8_t Laux[QK5_0];
+    struct fraction Faux[16 * QK5_0];
 
     float sum_x2 = 0;
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
@@ -2241,7 +2243,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
         const float * xb = x + QK5_0 * ib;
         const float * qw = quant_weights + QK5_0 * ib;
         for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight);
+        float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true);
         y[ib].d = GGML_FP32_TO_FP16(d);
 
         uint32_t qh = 0;
@@ -2403,6 +2405,74 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
     }
 }
 
+static void quantize_row_tq1_0_impl(const float * restrict x, block_tq1_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_tq1_0_ref(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK_K];
+    int8_t L[QK_K];
+    int8_t Laux[QK_K];
+    struct fraction Faux[1 * QK_K];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; }
+    float sigma2 = sum_x2/n_per_row;
+
+    const int64_t nb = n_per_row/QK_K;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK_K * ib;
+        const float * qw = quant_weights + QK_K * ib;
+        const int8_t * Lptr = L;
+        for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); }
+        float d = make_qkxs_quants(QK_K, -1, 1, xb, weight, L, Laux, Faux, false);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        // 5 elements per byte, along 32 bytes
+        for (size_t j = 0; j < sizeof(y->qs) - sizeof(y->qs) % 32; j += 32) {
+            for (size_t m = 0; m < 32; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 5; ++n) {
+                    q *= 3;
+                    q += Lptr[m + n*32];
+                }
+                // ceiling division (243 == pow(3, 5))
+                q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+                y[ib].qs[j + m] = q;
+            }
+            Lptr += 5*32;
+        }
+        // along 16 bytes
+        for (size_t j = sizeof(y->qs) - sizeof(y->qs) % 32; j < sizeof(y->qs); j += 16) {
+            for (size_t m = 0; m < 16; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 5; ++n) {
+                    q *= 3;
+                    q += Lptr[m + n*16];
+                }
+                // ceiling division (243 == pow(3, 5))
+                q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+                y[ib].qs[j + m] = q;
+            }
+            Lptr += 5*16;
+        }
+        // 4 elements per byte
+        for (size_t j = 0; j < sizeof(y->qh); ++j) {
+            uint8_t q = 0;
+            for (size_t m = 0; m < 4; ++m) {
+                q *= 3;
+                q += Lptr[j + m*sizeof(y->qh)];
+            }
+            // shift the first value to the most significant trit
+            q *= 3;
+            // ceiling division (243 == pow(3, 5))
+            q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+            y[ib].qh[j] = q;
+        }
+    }
+}
+
 void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
     assert(k % QK_K == 0);
     const int64_t nb = k / QK_K;
@@ -2435,17 +2505,69 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
     }
 }
 
+
+static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
+    if (!quant_weights) {
+        quantize_row_tq2_0_ref(x, y, n_per_row);
+        return;
+    }
+
+    float weight[QK_K];
+    int8_t L[QK_K];
+    int8_t Laux[QK_K];
+    struct fraction Faux[2 * QK_K];
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; }
+    float sigma2 = sum_x2/n_per_row;
+
+    const int64_t nb = n_per_row/QK_K;
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK_K * ib;
+        const float * qw = quant_weights + QK_K * ib;
+        for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); }
+        float d = make_qkxs_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux, true);
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        for (size_t j = 0; j < sizeof(y->qs); j += 32) {
+            for (size_t m = 0; m < 32; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 4; ++n) {
+                    q += (L[4*j + m + n*32] & 3) << (2*n);
+                }
+                y[ib].qs[j + m] = q;
+            }
+        }
+    }
+}
+
 size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    (void)quant_weights; // not used
-    const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
-    quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
+    if (!quant_weights) {
+        quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_tq1_0_impl(src, (block_tq1_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
     return nrow * row_size;
 }
 
 size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    (void)quant_weights; // not used
-    const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
-    quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
+    if (!quant_weights) {
+        quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrow; ++row) {
+        quantize_row_tq2_0_impl(src, (block_tq2_0*)qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += row_size;
+    }
     return nrow * row_size;
 }
 

From f27c1afc4040f3cb998b952cb571c280966b8ab2 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Fri, 7 Mar 2025 12:54:56 -0500
Subject: [PATCH 04/12] ggml-quants : improve TQ2_0 imatrix

---
 ggml/src/ggml-quants.c | 218 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 181 insertions(+), 37 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 4dcca9953501c..f13805124ffb6 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -687,11 +687,11 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
         }
         return 0.0f;
     }
+
     bool negative_scale = false;
     if (signed_scale && -nmin != nmax) {
         // the max side should have the biggest range
-        // FIXME: this is incorrect when the weights[.] do not sort in the same order as fabsf(x[.])
-        //        or is it some other condition?
+        // FIXME: this is not always the best sign
         if ((x[amax_i] < 0.0f) == (-nmin < nmax)) {
             // [-4, 3] ==> [-3, 4]
             const int tmp = nmin;
@@ -762,7 +762,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
                     .i=i,
                 };
             } else {
-                // stop when the inverse scale would result in clamping the max (FIXME: most important) value
+                // stop when the inverse scale would result in clamping the most important value
                 break;
             }
         }
@@ -802,6 +802,182 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
     return negative_scale ? -scale : scale;
 }
 
+// Very similar to make_qkxs_quants, but the sign of the scale is not assumed to be the sign of the absmax value.
+static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux) {
+    // start at zero
+    nmin = MIN(0, nmin);
+    nmax = MAX(0, nmax);
+    float amax = 0.0f;
+    float min = 0.0f;
+    float max = 0.0f;
+    float w_amax = 0.0f;
+    int amax_i = -1;
+    int w_amax_i = -1;
+    for (int i = 0; i < n; ++i) {
+        const float w = weights ? weights[i] : x[i] * x[i];
+        const float ax = fabsf(x[i]);
+        const float wax = w * ax;
+        if (ax > amax) { amax = ax; amax_i = i; }
+        if (x[i] > max) { max = x[i]; }
+        if (x[i] < min) { min = x[i]; }
+        // Find the most important value
+        if (wax > w_amax) { w_amax = wax; w_amax_i = i; }
+    }
+
+    if (amax < GROUP_MAX_EPS || amax_i < 0 || w_amax_i < 0) { // all zero
+        for (int i = 0; i < n; ++i) { L[i] = 0; }
+        return 0.0f;
+    }
+
+    // Use the side which will clamp first.
+    // The first clamped value is the absmax at the end of the common range.
+    // TODO: reduce the search space when one of the ranges is 0
+    const int amax_range = MIN(-nmin, nmax);
+    float sumlx_p = 0.0f;
+    float suml2_p = 0.0f;
+    float sumlx_n = 0.0f;
+    float suml2_n = 0.0f;
+    float scale = 0.0f;
+    float best = 0.0f;
+    float best_denom = 1.0f;
+    int best_i = -2; // not consecutive with 0..n_frac
+    // Pre-calculate the half-point for the common range.
+    // All smaller vectors have a representable vector with twice the values, and thus can be skipped.
+    if (amax_range > 1) {
+        const float iscale = ((float)(amax_range / 2 + 1))/amax;
+        for (int i = 0; i < n; ++i) {
+            const float w = weights ? weights[i] : x[i] * x[i];
+            int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
+            Laux[i] = l;
+            suml2_p += w * l * l;
+            sumlx_p += w * l * x[i];
+        }
+        sumlx_n = -sumlx_p;
+        suml2_n = suml2_p;
+        const float current_p = sumlx_p * sumlx_p;
+        if (suml2_p > 0.0f && current_p * best_denom > best * suml2_p) {
+            best = current_p;
+            best_denom = suml2_p;
+            scale = sumlx_p / suml2_p;
+            for (int i = 0; i < n; ++i) {
+                L[i] = Laux[i];
+            }
+            best_i = -1; // right before 0 of the loop after sorting
+        }
+    } else {
+        for (int i = 0; i < n; ++i) {
+            Laux[i] = 0;
+        }
+    }
+
+    const int imax_range = MAX(nmax, -nmin);
+    const int max_odd = 2*(imax_range + 1) + 1;
+    const float wmax = fabsf(x[w_amax_i]);
+    int n_frac = 0;
+    for (int i = 0; i < n; ++i) {
+        // assuming nmin <= nmax
+        const int odd_max = MAX(nmax, -nmin);
+        const float v = fabsf(x[i]);
+        const float v_max_odd = v * max_odd;
+        for (int j = abs(Laux[i]); j < odd_max; ++j) {
+            const float odd = 2*j + 1;
+            const float wmax_odd = wmax * odd;
+            if (wmax_odd < v_max_odd) {
+                Faux[n_frac++] = (struct fraction){
+                    .numer=v,
+                    .denom=odd,
+                    .i=i,
+                };
+            } else {
+                // stop when the inverse scale would result in clamping the most important value
+                break;
+            }
+        }
+    }
+
+    qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc);
+
+    const float max_common_odd = (MIN(nmax, -nmin) * 2) + 1;
+    const float max_odd_p = (nmax * 2) + 1;
+    const float max_odd_n = (-nmin * 2) + 1;
+
+    for (int i = 0; i < n_frac; ++i) {
+        // maximize the weighted cosine similarity
+        const int ii = Faux[i].i;
+        const float w = weights ? weights[ii] : x[ii] * x[ii];
+        const float lx = w * Faux[i].numer;
+        const float odd = Faux[i].denom;
+        const float l2 = w * odd;
+
+        Laux[ii] += x[ii] < 0.0f ? -1 : 1;
+
+        float sumlx = 0.0f;
+        float proj = 0.0f;
+        float norm = 0.0f;
+        if (odd < max_common_odd) {
+            sumlx_p += lx;
+            suml2_p += l2;
+            sumlx_n -= lx;
+            suml2_n += l2;
+
+            sumlx = sumlx_p;
+            proj = sumlx_p * sumlx_p;
+            norm = suml2_p;
+
+            // avoid double-copying Laux in a single iteration
+            if (suml2_p != suml2_n && suml2_p * suml2_n > 0.0f) {
+                const float proj_n = sumlx_n * sumlx_n;
+                if (proj_n * norm > proj * suml2_n) {
+                    sumlx = sumlx_n;
+                    proj = proj_n;
+                    norm = suml2_n;
+                }
+            }
+        } else if (x[ii] < 0.0f ? odd < max_odd_n : odd < max_odd_p) {
+            sumlx_p += lx;
+            suml2_p += l2;
+
+            sumlx = sumlx_p;
+            proj = sumlx_p * sumlx_p;
+            norm = suml2_p;
+        } else {
+            // outside the positive range means we're now into negatives
+            sumlx_n -= lx;
+            suml2_n += l2;
+
+            sumlx = sumlx_n;
+            proj = sumlx_n * sumlx_n;
+            norm = suml2_n;
+        }
+        if (norm > 0.0f && proj * best_denom > best * norm) {
+            best = proj;
+            best_denom = norm;
+            scale = sumlx / norm;
+            if (i == best_i + 1) {
+                // reduce copies for consecutive bests
+                L[ii] += x[ii] < 0.0f ? -1 : 1;
+            } else {
+                for (int j = 0; j < n; ++j) {
+                    L[j] = Laux[j];
+                }
+            }
+            best_i = i;
+        }
+    }
+
+    if (scale < 0.0f) {
+        for (int i = 0; i < n; ++i) {
+            L[i] = MAX(nmin, MIN(-L[i], nmax)) - nmin;
+        }
+    } else {
+        for (int i = 0; i < n; ++i) {
+            L[i] = MAX(nmin, MIN(L[i], nmax)) - nmin;
+        }
+    }
+
+    return scale;
+}
+
 // non-linear exhaustive search with cumulative sums
 // Need Faux to have room for n*k fractions
 static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const float * restrict weights, const int8_t * restrict kvalues, uint8_t * restrict L, uint8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) {
@@ -874,6 +1050,7 @@ static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const f
     }
 
     // Non-linear mappings are usually not symmetric, so try negating the scale
+    // This is the same as above, but keeping the old best if the new best is not better.
     if (signed_scale) {
         for (int i = 0; i < n; ++i) {
             Laux[i] = koff;
@@ -1298,7 +1475,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
         float amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {
             scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true);
-            // scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
             float scale = fabsf(scales[j]);
             if (scale > amax) {
                 amax = scale; max_scale = scales[j];
@@ -1324,21 +1500,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
             y[i].d = GGML_FP32_TO_FP16(0.f);
         }
 
-        // int8_t sc;
-        // for (int j = 0; j < QK_K/16; ++j) {
-        //     sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
-        //     sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-        //     float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-        //     if (!d) {
-        //         continue;
-        //     }
-        //     for (int ii = 0; ii < 16; ++ii) {
-        //         int l = nearest_int(x[16*j + ii]/d);
-        //         l = MAX(-4, MIN(3, l));
-        //         L[16*j + ii] = l + 4;
-        //     }
-        // }
-
         memset(y[i].hmask, 0, QK_K/8);
         // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
         int m = 0;
@@ -1441,14 +1602,12 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
             for (int l = 0; l < 16; ++l) sumw += weight[l];
             sw[j] = sumw;
 
-            // scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight);
             scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true);
 
         }
 
         memset(y[i].scales, 0, 12);
 
-        // float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
         float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true);
         for (int j = 0; j < QK_K/16; ++j) {
             int l = Ls[j];
@@ -1462,21 +1621,6 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
         }
         y[i].d = GGML_FP32_TO_FP16(d_block);
 
-        // int8_t sc;
-        // for (int j = 0; j < QK_K/16; ++j) {
-        //     sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
-        //     sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-        //     float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-        //     if (!d) {
-        //         continue;
-        //     }
-        //     for (int ii = 0; ii < 16; ++ii) {
-        //         int l = nearest_int(x[16*j + ii]/d);
-        //         l = MAX(-4, MIN(3, l));
-        //         L[16*j + ii] = l + 4;
-        //     }
-        // }
-
         memset(y[i].hmask, 0, QK_K/8);
         // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
         int m = 0;
@@ -2526,7 +2670,7 @@ static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * rest
         const float * xb = x + QK_K * ib;
         const float * qw = quant_weights + QK_K * ib;
         for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); }
-        float d = make_qkxs_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux, true);
+        float d = make_qkxss_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux);
         y[ib].d = GGML_FP32_TO_FP16(d);
 
         for (size_t j = 0; j < sizeof(y->qs); j += 32) {

From 0c9e44248996f3bcd08ca2c1acf0ef5548428f41 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sat, 15 Mar 2025 10:29:47 -0400
Subject: [PATCH 05/12] ggml-quants : remove some commented code

---
 ggml/src/ggml-quants.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index f13805124ffb6..78c0e2f492a77 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -5173,7 +5173,6 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
         } else {
             for (int j = 0; j < block_size; ++j) weight[j] = sqrtf(sigma2 + xb[j]*xb[j]);
-            // for (int j = 0; j < block_size; ++j) weight[j] = 1;
         }
         float amax = 0;
         for (int j = 0; j < block_size; ++j) {
@@ -5258,7 +5257,6 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
     return nrow * nblock * sizeof(block_iq4_nl);
 }
 
-//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
 void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
     GGML_ASSERT(k%QK4_NL == 0);
     int64_t nblock = k/QK4_NL;

From 30ad9c2873fd95ae0f4ecae4fabe0fb812dc640d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sat, 15 Mar 2025 12:55:22 -0400
Subject: [PATCH 06/12] ggml-quants : faster exhaustive IQ4_NL rounding with
 k_heap

---
 ggml/src/ggml-quants.c | 317 +++++++++++++++++++++++++++++------------
 1 file changed, 227 insertions(+), 90 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 78c0e2f492a77..6762e98f68097 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -658,6 +658,153 @@ static inline int compare_fractions_desc(const void * a, const void * b) {
     return (ab == ba) ? ((a > b) ? 1 : -1) : (ab < ba) ? 1 : -1;
 }
 
+struct k_heap_cell {
+    float frac;
+    float x;
+    int x_i;
+    int k_i;
+};
+
+// Faster enumeration for cumulative search
+struct k_heap {
+    int n;
+    int k;
+    int mid_k;
+    int8_t kmin;
+    int8_t kmax;
+    const float * odd;
+    const float * steps;
+    struct k_heap_cell * heap;
+};
+
+// build a max heap out of k_cells starting from node i
+static void k_heap_build(struct k_heap * heap, int i) {
+    const int n = heap->n;
+    int max = i;
+    int prev_max;
+    while (max < n / 2) {
+        prev_max = max;
+        int left = 2*(max + 1) - 1;
+        int right = 2*(max + 1);
+        if (left < n && heap->heap[left].frac > heap->heap[max].frac) {
+            max = left;
+        }
+        if (right < n && heap->heap[right].frac > heap->heap[max].frac) {
+            max = right;
+        }
+        if (max != prev_max) {
+            struct k_heap_cell tmp = heap->heap[prev_max];
+            heap->heap[prev_max] = heap->heap[max];
+            heap->heap[max] = tmp;
+        } else {
+            break;
+        }
+    }
+}
+
+// Assuming kvalues is sorted from most negative to most positive.
+// odd and steps are buffers of size k, while heap_cells should be big enough for x later on.
+static void k_heap_init(struct k_heap * restrict k_heap, int k, const int8_t * restrict kvalues, struct k_heap_cell * restrict heap_cells, float * restrict odd, float * restrict steps) {
+    GGML_ASSERT(k_heap && kvalues && heap_cells && odd);
+    k_heap->n = 0;
+    k_heap->k = k;
+    k_heap->odd = odd;
+    k_heap->steps = steps;
+    k_heap->heap = heap_cells;
+
+    int amin = abs(kvalues[0]);
+    int amax = abs(kvalues[0]);
+    int mid_k = 0;
+    int max_k = 0;
+    for (int i = 1; i < k; ++i) {
+        const int ak = abs(kvalues[i]);
+        if (ak < amin) {
+            amin = ak;
+            mid_k = i;
+        }
+        if (ak > amax) {
+            amax = ak;
+            max_k = i;
+        }
+    }
+    k_heap->mid_k = mid_k;
+    k_heap->kmin = kvalues[mid_k];
+    k_heap->kmax = kvalues[max_k];
+
+    for (int i = 0; i < k - 1; ++i) {
+        const float threshold = kvalues[i + 1] + kvalues[i];
+        const float step = kvalues[i + 1] - kvalues[i];
+        // It's amazing how their product is the difference between consecutive squares of the kvalues
+        GGML_ASSERT(threshold * step != 0.0f);
+        odd[i + (i >= mid_k ? 1 : 0)] = fabsf(threshold);
+        steps[i + (i >= mid_k ? 1 : 0)] = fabsf(step);
+    }
+    odd[mid_k] = 0.0f;
+    steps[mid_k] = 0.0f;
+
+    GGML_ASSERT(mid_k > 0 && mid_k + 1 < k);
+}
+
+// TODO: initial quantized values
+static void k_heap_set_x(struct k_heap * k_heap, const float * restrict x, int n, bool invert_sign) {
+    // TODO: sanity checks
+    k_heap->n = n;
+    for (int i = 0; i < n; ++i) {
+        const int k_i = ((x[i] < 0.0f) != invert_sign) ? k_heap->mid_k - 1 : k_heap->mid_k + 1;
+        k_heap->heap[i] = (struct k_heap_cell) {
+            .k_i=k_i,
+            .x_i=i,
+            .x=fabsf(x[i]),
+            .frac=fabsf(x[i] / k_heap->odd[k_i]),
+        };
+    }
+
+    for (int i = (k_heap->n / 2) - 1; i >= 0; --i) {
+        k_heap_build(k_heap, i);
+    }
+}
+
+static struct fraction k_heap_pop(struct k_heap * k_heap) {
+    if (k_heap && k_heap->n > 0) {
+        struct k_heap_cell * heap_cell = k_heap->heap;
+        const float step = k_heap->steps[heap_cell->k_i];
+        // Properly turn this into a difference of consecutive squares
+        struct fraction frac = (struct fraction) {
+            .numer=heap_cell->x*step,
+            .denom=k_heap->odd[heap_cell->k_i]*step,
+            .i=heap_cell->x_i,
+        };
+
+        if (heap_cell->k_i < k_heap->mid_k) {
+            if (heap_cell->k_i > 0) {
+                heap_cell->k_i -= 1;
+                heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i];
+            } else {
+                // remove this node
+                k_heap->heap[0] = k_heap->heap[k_heap->n - 1];
+                k_heap->n -= 1;
+            }
+        } else {
+            if (heap_cell->k_i < k_heap->k - 1) {
+                heap_cell->k_i += 1;
+                heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i];
+            } else {
+                // remove this node
+                k_heap->heap[0] = k_heap->heap[k_heap->n - 1];
+                k_heap->n -= 1;
+            }
+        }
+        k_heap_build(k_heap, 0);
+
+        return frac;
+    }
+    return (struct fraction) {
+        .numer=0.0f,
+        .denom=0.0f,
+        .i=-1,
+    };
+}
+
 // exhaustive search with cumulative sums
 // Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions
 static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) {
@@ -979,111 +1126,89 @@ static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict
 }
 
 // non-linear exhaustive search with cumulative sums
-// Need Faux to have room for n*k fractions
-static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const float * restrict weights, const int8_t * restrict kvalues, uint8_t * restrict L, uint8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) {
+static float make_qkxs_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) {
     float sumlx = 0.0f;
     float suml2 = 0.0f;
-    int kmin = abs(kvalues[0]);
-    int koff = 0;
-    for (int i = 1; i < k; ++i) {
-        int ak = abs(kvalues[i]);
-        if (ak < kmin) {
-            kmin = ak;
-            koff = i;
-        }
-    }
-    kmin = kvalues[koff];
+    float amax = -1.0f;
+    int amax_i = -1;
+    const int8_t kmin = k_heap->kmin;
     for (int i = 0; i < n; ++i) {
-        float w = weights ? weights[i] : x[i] * x[i];
-        Laux[i] = koff;
+        const float w = weights ? weights[i] : x[i] * x[i];
+        const float ax = fabsf(x[i]);
+        if (ax > amax) {
+            amax = ax;
+            amax_i = i;
+        }
+        Laux[i] = k_heap->mid_k;
         sumlx += w * x[i] * kmin;
         suml2 += w * kmin * kmin;
     }
 
-    int n_frac_p = 0;
-    for (int i = 0; i < n; ++i) {
-        const int start = x[i] < 0.0f ? 1 : koff + 1;
-        const int end = x[i] < 0.0f ? koff + 1: k;
-        for (int j = start; j < end; ++j) {
-            const float threshold = kvalues[j] + kvalues[j - 1];
-            const float step = kvalues[j] - kvalues[j - 1];
-            Faux[n_frac_p++] = (struct fraction){
-                // This should always be positive or else
-                // the fraction comparison function won't work properly
-                .numer=fabsf(x[i] * step),
-                // It's amazing how this is still the difference of consecutive squares
-                .denom=fabsf(threshold * step),
-                .i=i,
-            };
-        }
-    }
+    const bool neg_scale = signed_scale && fast ? (x[amax_i] < 0.0f) != (k_heap->kmax < 0) : false;
 
-    qsort(Faux, n_frac_p, sizeof(struct fraction), compare_fractions_desc);
+    k_heap_set_x(k_heap, x, n, neg_scale);
 
-    float best = 0.0f;
-    float best_sumlx = 0.0f;
-    float best_suml2 = 1.0f;
-    float sumlx_p = sumlx;
-    float suml2_p = suml2;
-    int best_p_i = -2; // not consecutive with 0..n_frac
-    for (int i = 0; i < n_frac_p; ++i) {
-        const int ii = Faux[i].i;
-        const float w = weights ? weights[ii] : x[ii] * x[ii];
-        sumlx_p += w * Faux[i].numer;
-        suml2_p += w * Faux[i].denom;
-        const float current = sumlx_p * sumlx_p;
-        Laux[ii] += x[ii] < 0.0f ? -1 : 1;
-        if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) {
-            best = current;
-            best_sumlx = sumlx_p;
-            best_suml2 = suml2_p;
-            if (i == best_p_i + 1) {
-                // reduce copies for consecutive bests
-                L[ii] += x[ii] < 0.0f ? -1 : 1;
-            } else {
-                for (int j = 0; j < n; ++j) {
-                    L[j] = Laux[j];
+    float best;
+    float best_sumlx;
+    float best_suml2;
+    if (suml2 != 0.0f) {
+        best = sumlx * sumlx;
+        best_sumlx = neg_scale ? -sumlx : sumlx;
+        best_suml2 = suml2 != 0.0f ? suml2 : 1.0f;
+    } else {
+        best = 0.0f;
+        best_sumlx = 0.0f;
+        best_suml2 = 1.0f;
+    }
+    {
+        float sumlx_p = neg_scale ? -sumlx : sumlx;
+        float suml2_p = suml2;
+        int best_p_i = -2; // not consecutive with 0..n_frac
+        int i = 0;
+        while (k_heap->n > 0) {
+            struct fraction frac = k_heap_pop(k_heap);
+            const int ii = frac.i;
+            const float w = weights ? weights[ii] : x[ii] * x[ii];
+            sumlx_p += w * frac.numer;
+            suml2_p += w * frac.denom;
+            const float current = sumlx_p * sumlx_p;
+            Laux[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1;
+            if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) {
+                best = current;
+                best_sumlx = neg_scale ? -sumlx_p : sumlx_p;
+                best_suml2 = suml2_p;
+                if (i == best_p_i + 1) {
+                    // reduce copies for consecutive bests
+                    L[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1;
+                } else {
+                    for (int j = 0; j < n; ++j) {
+                        L[j] = Laux[j];
+                    }
                 }
+                best_p_i = i;
             }
-            best_p_i = i;
         }
     }
 
     // Non-linear mappings are usually not symmetric, so try negating the scale
     // This is the same as above, but keeping the old best if the new best is not better.
-    if (signed_scale) {
+    if (signed_scale && !fast) {
         for (int i = 0; i < n; ++i) {
-            Laux[i] = koff;
-        }
-
-        int n_frac_n = 0;
-        for (int i = 0; i < n; ++i) {
-            const int start = x[i] >= 0.0f ? 1 : koff + 1;
-            const int end = x[i] >= 0.0f ? koff + 1: k;
-            for (int j = start; j < end; ++j) {
-                const float threshold = kvalues[j] + kvalues[j - 1];
-                const float step = kvalues[j] - kvalues[j - 1];
-                Faux[n_frac_n++] = (struct fraction){
-                    // This should always be positive or else
-                    // the fraction comparison function won't work properly
-                    .numer=fabsf(x[i] * step),
-                    // It's amazing how this is still the difference of consecutive squares
-                    .denom=fabsf(threshold * step),
-                    .i=i,
-                };
-            }
+            Laux[i] = k_heap->mid_k;
         }
 
-        qsort(Faux, n_frac_n, sizeof(struct fraction), compare_fractions_desc);
+        k_heap_set_x(k_heap, x, n, true);
 
         float sumlx_n = -sumlx;
         float suml2_n = suml2;
         int best_n_i = -2; // not consecutive with 0..n_frac
-        for (int i = 0; i < n_frac_n; ++i) {
-            const int ii = Faux[i].i;
+        int i = 0;
+        while (k_heap->n > 0) {
+            struct fraction frac = k_heap_pop(k_heap);
+            const int ii = frac.i;
             const float w = weights ? weights[ii] : x[ii] * x[ii];
-            sumlx_n += w * Faux[i].numer;
-            suml2_n += w * Faux[i].denom;
+            sumlx_n += w * frac.numer;
+            suml2_n += w * frac.denom;
             const float current = sumlx_n * sumlx_n;
             Laux[ii] += x[ii] >= 0.0f ? -1 : 1;
             if (suml2_n > 0.0f && current * best_suml2 > best * suml2_n) {
@@ -1100,6 +1225,7 @@ static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const f
                 }
                 best_n_i = i;
             }
+            ++i;
         }
     }
 
@@ -5153,8 +5279,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
 
 static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
         ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
-        float * scales, float * weight, uint8_t * L, uint8_t * Laux, struct fraction * Faux,
-        const int8_t * values,
+        float * scales, float * weight, uint8_t * L, uint8_t * Laux, struct k_heap * k_heap,
         const float * quant_weights) {
 
     float sigma2 = 0;
@@ -5185,7 +5310,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             scales[ib] = 0;
             continue;
         }
-        float d = make_qkxs_nl_quants(block_size, 16, xb, weight, values, Lb, Laux, Faux, true);
+        float d = make_qkxs_nl_quants(block_size, xb, weight, Lb, Laux, k_heap, true, !quant_weights);
         scales[ib] = d;
         float abs_d = fabsf(d);
         if (abs_d > amax_scale) {
@@ -5239,17 +5364,21 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
     char * qrow = (char *)dst;
     uint8_t L[QK4_NL];
     uint8_t Laux[QK4_NL];
-    struct fraction Faux[QK4_NL * 16];
+    struct k_heap_cell heap_cells[QK4_NL];
+    struct k_heap k_heap;
+    float odd[16];
+    float steps[16];
     float weight[QK4_NL];
     uint16_t unused_h;
     uint8_t * unused_l = NULL;
     float scale;
+    k_heap_init(&k_heap, 16, kvalues_iq4nl, heap_cells, odd, steps);
     for (int64_t row = 0; row < nrow; ++row) {
         block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
             quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                    &scale, weight, L, Laux, Faux, kvalues_iq4nl, qw);
+                    &scale, weight, L, Laux, &k_heap, qw);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_nl);
@@ -5262,15 +5391,19 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
     int64_t nblock = k/QK4_NL;
     uint8_t L[QK4_NL];
     uint8_t Laux[QK4_NL];
-    struct fraction Faux[QK4_NL * 16];
+    struct k_heap_cell heap_cells[QK4_NL];
+    struct k_heap k_heap;
+    float odd[16];
+    float steps[16];
     float weight[QK4_NL];
     uint16_t unused_h;
     uint8_t * unused_l = NULL;
     float scale;
     block_iq4_nl * iq4 = y;
+    k_heap_init(&k_heap, 16, kvalues_iq4nl, heap_cells, odd, steps);
     for (int ibl = 0; ibl < nblock; ++ibl) {
         quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                &scale, weight, L, Laux, Faux, kvalues_iq4nl, NULL);
+                &scale, weight, L, Laux, &k_heap, NULL);
     }
 }
 
@@ -5280,15 +5413,19 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
     char * qrow = (char *)dst;
     uint8_t L[QK_K];
     uint8_t Laux[32];
-    struct fraction Faux[32 * 16];
+    struct k_heap_cell heap_cells[32];
+    struct k_heap k_heap;
+    float odd[16];
+    float steps[16];
     float weight[32];
     float scales[QK_K/32];
+    k_heap_init(&k_heap, 16, kvalues_iq4nl, heap_cells, odd, steps);
     for (int64_t row = 0; row < nrow; ++row) {
         block_iq4_xs * iq4 = (block_iq4_xs *)qrow;
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
             quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
-                    scales, weight, L, Laux, Faux, kvalues_iq4nl, qw);
+                    scales, weight, L, Laux, &k_heap, qw);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_xs);

From 3be115100f581f418701362aa22c1c3a5d382b89 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Thu, 20 Mar 2025 19:21:45 -0400
Subject: [PATCH 07/12] ggml-quants : use a max-heap for linear quants like
 Q3_K

Slightly faster than the previous method.
---
 ggml/src/ggml-quants.c | 253 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 224 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 6762e98f68097..1267e02dd3822 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -635,7 +635,7 @@ struct fraction {
     int i;
 };
 
-// comparator function for sorting fractions in make_qkxs_quants
+// comparator function for sorting fractions
 static inline int compare_fractions_desc(const void * a, const void * b) {
     const struct fraction * f_a = (const struct fraction *) a;
     const struct fraction * f_b = (const struct fraction *) b;
@@ -734,51 +734,106 @@ static void k_heap_init(struct k_heap * restrict k_heap, int k, const int8_t * r
     for (int i = 0; i < k - 1; ++i) {
         const float threshold = kvalues[i + 1] + kvalues[i];
         const float step = kvalues[i + 1] - kvalues[i];
-        // It's amazing how their product is the difference between consecutive squares of the kvalues
+        // It's amazing how their product is the difference between consecutive squares of the kvalues,
+        // but it makes sense because a*a - b*b == (a + b)*(a - b).
         GGML_ASSERT(threshold * step != 0.0f);
         odd[i + (i >= mid_k ? 1 : 0)] = fabsf(threshold);
         steps[i + (i >= mid_k ? 1 : 0)] = fabsf(step);
     }
     odd[mid_k] = 0.0f;
     steps[mid_k] = 0.0f;
+}
+
+static void k_heap_init_linear(struct k_heap * k_heap, int nmin, int nmax, struct k_heap_cell * restrict heap_cells, float * restrict odd) {
+    GGML_ASSERT(k_heap && heap_cells && odd);
+    nmin = MIN(0, nmin);
+    nmax = MAX(0, nmax);
+    k_heap->n = 0;
+    k_heap->k = nmax - nmin + 1;
+    k_heap->odd = odd;
+    k_heap->steps = NULL;
+    k_heap->heap = heap_cells;
+
+    k_heap->mid_k = -nmin;
+    k_heap->kmin = 0; // the range should always overlap 0
+    k_heap->kmax = abs(nmin) > abs(nmax) ? nmin : nmax;
+
+    for (int i = nmin; i < nmax; ++i) {
+        // odd numbers are the difference between consecutive squares
+        odd[i - nmin + (i >= 0 ? 1 : 0)] = fabsf((float) (i + (i + 1)));
+    }
+    odd[-nmin] = 0.0f;
+}
+
+// with initial quantized values
+static void k_heap_set_x_L(struct k_heap * k_heap, const float * restrict x, const int8_t * restrict L, int n, bool invert_sign) {
+    int j = 0;
+    for (int i = 0; i < n; ++i) {
+        const int k_i = ((x[i] < 0.0f) != invert_sign) ? L[i] - 1 : L[i] + 1;
+        GGML_ASSERT(k_i != k_heap->mid_k);
+        if (k_i >= 0 && k_i < k_heap->k) {
+            k_heap->heap[j++] = (struct k_heap_cell) {
+                .k_i=k_i,
+                .x_i=i,
+                .x=fabsf(x[i]),
+                .frac=fabsf(x[i] / k_heap->odd[k_i]),
+            };
+        }
+    }
+    k_heap->n = j;
 
-    GGML_ASSERT(mid_k > 0 && mid_k + 1 < k);
+    for (int i = (k_heap->n / 2) - 1; i >= 0; --i) {
+        k_heap_build(k_heap, i);
+    }
 }
 
-// TODO: initial quantized values
+// assuming the initial quantized value are all at k_heap->mid_k
 static void k_heap_set_x(struct k_heap * k_heap, const float * restrict x, int n, bool invert_sign) {
-    // TODO: sanity checks
-    k_heap->n = n;
+    int j = 0;
     for (int i = 0; i < n; ++i) {
         const int k_i = ((x[i] < 0.0f) != invert_sign) ? k_heap->mid_k - 1 : k_heap->mid_k + 1;
-        k_heap->heap[i] = (struct k_heap_cell) {
-            .k_i=k_i,
-            .x_i=i,
-            .x=fabsf(x[i]),
-            .frac=fabsf(x[i] / k_heap->odd[k_i]),
-        };
+        if (k_i >= 0 && k_i < k_heap->k) {
+            k_heap->heap[j++] = (struct k_heap_cell) {
+                .k_i=k_i,
+                .x_i=i,
+                .x=fabsf(x[i]),
+                .frac=fabsf(x[i] / k_heap->odd[k_i]),
+            };
+        }
     }
+    k_heap->n = j;
 
     for (int i = (k_heap->n / 2) - 1; i >= 0; --i) {
         k_heap_build(k_heap, i);
     }
 }
 
+// returns the fractions in descending order
 static struct fraction k_heap_pop(struct k_heap * k_heap) {
     if (k_heap && k_heap->n > 0) {
         struct k_heap_cell * heap_cell = k_heap->heap;
-        const float step = k_heap->steps[heap_cell->k_i];
-        // Properly turn this into a difference of consecutive squares
-        struct fraction frac = (struct fraction) {
-            .numer=heap_cell->x*step,
-            .denom=k_heap->odd[heap_cell->k_i]*step,
-            .i=heap_cell->x_i,
-        };
+        struct fraction frac;
+        if (k_heap->steps) {
+            const float step = k_heap->steps[heap_cell->k_i];
+            // Properly turn this into a difference of consecutive squares even for non-linear steps
+            frac = (struct fraction) {
+                .numer=heap_cell->x * step,
+                .denom=k_heap->odd[heap_cell->k_i] * step,
+                .i=heap_cell->x_i,
+            };
+        } else {
+            // step is always 1 for linear quants
+            frac = (struct fraction) {
+                .numer=heap_cell->x,
+                .denom=k_heap->odd[heap_cell->k_i],
+                .i=heap_cell->x_i,
+            };
+        }
 
         if (heap_cell->k_i < k_heap->mid_k) {
             if (heap_cell->k_i > 0) {
                 heap_cell->k_i -= 1;
-                heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i];
+                heap_cell->frac = heap_cell->x / k_heap->odd[heap_cell->k_i];
             } else {
                 // remove this node
                 k_heap->heap[0] = k_heap->heap[k_heap->n - 1];
@@ -787,7 +842,7 @@ static struct fraction k_heap_pop(struct k_heap * k_heap) {
         } else {
             if (heap_cell->k_i < k_heap->k - 1) {
                 heap_cell->k_i += 1;
-                heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i];
+                heap_cell->frac = heap_cell->x / k_heap->odd[heap_cell->k_i];
             } else {
                 // remove this node
                 k_heap->heap[0] = k_heap->heap[k_heap->n - 1];
@@ -838,7 +893,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
     bool negative_scale = false;
     if (signed_scale && -nmin != nmax) {
         // the max side should have the biggest range
-        // FIXME: this is not always the best sign
+        // NOTE: this is not always the best sign
         if ((x[amax_i] < 0.0f) == (-nmin < nmax)) {
             // [-4, 3] ==> [-3, 4]
             const int tmp = nmin;
@@ -870,7 +925,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
     if (amax_range > 1) {
         // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value.
         // Proof: anything smaller has a representable vector with values twice as big.
-        const float iscale = ((float)(amax_range / 2 + 1))/range_max * (negative_scale ? -1.0f : 1.0f);
+        const float iscale = ((float)((amax_range >> 1) + 1))/range_max * (negative_scale ? -1.0f : 1.0f);
         for (int i = 0; i < n; ++i) {
             const float w = weights[i];
             int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
@@ -949,6 +1004,129 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
     return negative_scale ? -scale : scale;
 }
 
+static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) {
+    const int nmin = -k_heap->mid_k; // TODO: maybe directly pass these
+    const int nmax = k_heap->k + nmin - 1;
+    float amax = fabsf(x[0]);
+    float w_amax = (weights ? weights[0] : x[0] * x[0]) * amax;
+    int amax_i = 0;
+    int w_amax_i = 0;
+    for (int i = 1; i < n; ++i) {
+        // Find the most important value
+        const float w = weights ? weights[i] : x[i] * x[i];
+        const float ax = fabsf(x[i]);
+        const float wax = w * ax;
+        if (ax > amax) {
+            amax = ax;
+            amax_i = i;
+        }
+        if (wax > w_amax) {
+            w_amax = wax;
+            w_amax_i = i;
+        }
+    }
+
+    if (amax < GROUP_MAX_EPS) { // all zero
+        for (int i = 0; i < n; ++i) {
+            L[i] = 0;
+        }
+        return 0.0f;
+    }
+
+    bool negative_scale = false;
+    if (signed_scale && -nmin > nmax) {
+        // the max side should have the biggest range
+        // NOTE: this is not always the best sign, but seems to be a good heuristic.
+        if ((x[amax_i] < 0.0f) == (-nmin < nmax)) {
+            // [-4, 3] ==> [-3, 4]
+            negative_scale = true;
+        }
+    }
+
+    // Find the max range in [0, amax_range] which doesn't result in clamping.
+    // This is the range from the side which would clamp first (biggest ratio of max to nmax).
+    // But it's easier and safer to simply use the smallest range.
+    int amax_range = MIN(abs(nmin), abs(nmax));
+    if (amax_range == 0) {
+        // one side will clamp anyway
+        amax_range = MAX(abs(nmin), abs(nmax));
+    }
+    float sumlx = 0.0f;
+    float suml2 = 0.0f;
+    float scale = 0.0f;
+    float best = 0.0f;
+    float best_denom = 1.0f; // should never be zero
+    if (amax_range > 1) {
+        // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value.
+        // Proof: anything smaller has a representable vector with values twice as big.
+        // TODO: use a bigger iscale in asymmetric cases when possible
+        // NOTE: strangely, when using half+1, with nmin=-2 and nmax=2, the corners look slighlty clipped,
+        //       but this does not happen when using half of the range as a starting point.
+        const float iscale = ((float)(amax_range >> 1))/amax * (negative_scale ? -1.0f : 1.0f);
+        for (int i = 0; i < n; ++i) {
+            const float w = weights ? weights[i] : x[i] * x[i];
+            int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
+            Laux[i] = l + k_heap->mid_k;
+            suml2 += w * l * l;
+            sumlx += w * l * x[i];
+        }
+        if (suml2 > 0.0f) {
+            best = sumlx * sumlx;
+            best_denom = suml2;
+            scale = sumlx / suml2;
+        }
+    } else {
+        memset(Laux, k_heap->mid_k, n);
+    }
+    memcpy(L, Laux, n);
+
+    k_heap_set_x_L(k_heap, x, Laux, n, negative_scale);
+
+    const int imax_range = MAX(abs(nmin), abs(nmax));
+    // const int imax_range = (x[w_amax_i] < 0.0f) != negative_scale ? abs(nmin) : abs(nmax);
+    const int max_odd = 2*(imax_range + 1) + 1;
+    const float wmax = fabsf(x[w_amax_i]);
+    // const float wmax = amax;
+    {
+        int best_p_i = -1; // consecutive with 0..n_frac
+        int i = 0;
+        while (k_heap->n > 0) {
+            struct fraction frac = k_heap_pop(k_heap);
+            if (frac.numer == 0.0f) { break; }
+            const float v_max_odd = frac.numer * max_odd;
+            if (wmax * frac.denom > v_max_odd) {
+                // stop when the inverse scale would result in clamping the most important value
+                break;
+            }
+            // maximize the weighted cosine similarity
+            const int ii = frac.i;
+            const float w = weights ? weights[ii] : x[ii] * x[ii];
+            if (negative_scale) {
+                frac.numer = -frac.numer;
+            }
+            sumlx += w * frac.numer;
+            suml2 += w * frac.denom;
+            const float current = sumlx * sumlx;
+            Laux[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1;
+            if (suml2 > 0.0f && current * best_denom > best * suml2) {
+                best = current;
+                best_denom = suml2;
+                scale = sumlx / suml2;
+                if (i == best_p_i + 1) {
+                    // reduce copies for consecutive bests
+                    L[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1;
+                } else {
+                    memcpy(L, Laux, n);
+                }
+                best_p_i = i;
+            }
+            i += 1;
+        }
+    }
+
+    return scale;
+}
+
 // Very similar to make_qkxs_quants, but the sign of the scale is not assumed to be the sign of the absmax value.
 static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux) {
     // start at zero
@@ -991,7 +1169,7 @@ static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict
     // Pre-calculate the half-point for the common range.
     // All smaller vectors have a representable vector with twice the values, and thus can be skipped.
     if (amax_range > 1) {
-        const float iscale = ((float)(amax_range / 2 + 1))/amax;
+        const float iscale = ((float)((amax_range >> 1) + 1))/amax;
         for (int i = 0; i < n; ++i) {
             const float w = weights ? weights[i] : x[i] * x[i];
             int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
@@ -1587,10 +1765,15 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
 
     int8_t L[QK_K];
     int8_t Laux[16];
-    struct fraction Faux[16 * 4];
+    // struct fraction Faux[16 * 4];
+    struct k_heap_cell heap_cells[16];
+    float odd[8];
+    struct k_heap k_heap;
     float scales[QK_K / 16];
     float weights[16];
 
+    k_heap_init_linear(&k_heap, -4, 3, heap_cells, odd);
+
     for (int i = 0; i < 16; ++i) {
         weights[i] = 1.0f;
     }
@@ -1600,7 +1783,8 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
         float max_scale = 0;
         float amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true);
+            // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true);
+            scales[j] = make_qkxh_quants(16, x + 16*j, weights, L + 16*j, Laux, &k_heap, true);
             float scale = fabsf(scales[j]);
             if (scale > amax) {
                 amax = scale; max_scale = scales[j];
@@ -1709,7 +1893,16 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
     float weight[16];
     float sw[QK_K / 16];
     int8_t Ls[QK_K / 16];
-    struct fraction Faux[16 * 32];
+    // struct fraction Faux[16 * 32];
+    struct k_heap_cell heap_cells[16];
+    float odd[8];
+    struct k_heap k_heap;
+    struct k_heap_cell heap_cells_s[QK_K / 16];
+    float odd_s[64];
+    struct k_heap k_heap_s;
+
+    k_heap_init_linear(&k_heap, -4, 3, heap_cells, odd);
+    k_heap_init_linear(&k_heap_s, -32, 31, heap_cells_s, odd_s);
 
     for (int i = 0; i < nb; i++) {
 
@@ -1728,13 +1921,15 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
             for (int l = 0; l < 16; ++l) sumw += weight[l];
             sw[j] = sumw;
 
-            scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true);
+            // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true);
+            scales[j] = make_qkxh_quants(16, x + 16*j, weight, L + 16*j, Laux, &k_heap, true);
 
         }
 
         memset(y[i].scales, 0, 12);
 
-        float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true);
+        // float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true);
+        float d_block = make_qkxh_quants(QK_K/16, scales, sw, Ls, Laux, &k_heap_s, true);
         for (int j = 0; j < QK_K/16; ++j) {
             int l = Ls[j];
             if (j < 8) {

From f86b8ff21084c86ab4722887f6b4f6a38f67cbf6 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Fri, 21 Mar 2025 14:05:58 -0400
Subject: [PATCH 08/12] ggml-quants : use qkxh in more places

---
 ggml/src/ggml-quants.c | 61 +++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 1267e02dd3822..5e25887f10fc6 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -629,7 +629,6 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
 }
 
 struct fraction {
-    // float frac;
     float numer;
     float denom;
     int i;
@@ -677,7 +676,8 @@ struct k_heap {
     struct k_heap_cell * heap;
 };
 
-// build a max heap out of k_cells starting from node i
+// build a max heap out of k_cells starting from node i;
+// makes sure the node i is bigger than its children
 static void k_heap_build(struct k_heap * heap, int i) {
     const int n = heap->n;
     int max = i;
@@ -744,6 +744,7 @@ static void k_heap_init(struct k_heap * restrict k_heap, int k, const int8_t * r
     steps[mid_k] = 0.0f;
 }
 
+// for linear types which have a constant step of 1 between representable values
 static void k_heap_init_linear(struct k_heap * k_heap, int nmin, int nmax, struct k_heap_cell * restrict heap_cells, float * restrict odd) {
     GGML_ASSERT(k_heap && heap_cells && odd);
     nmin = MIN(0, nmin);
@@ -1004,6 +1005,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
     return negative_scale ? -scale : scale;
 }
 
+// exhaustive search with cumulative sums
 static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) {
     const int nmin = -k_heap->mid_k; // TODO: maybe directly pass these
     const int nmax = k_heap->k + nmin - 1;
@@ -1027,9 +1029,7 @@ static float make_qkxh_quants(int n, const float * restrict x, const float * res
     }
 
     if (amax < GROUP_MAX_EPS) { // all zero
-        for (int i = 0; i < n; ++i) {
-            L[i] = 0;
-        }
+        memset(L, 0, n);
         return 0.0f;
     }
 
@@ -1304,7 +1304,7 @@ static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict
 }
 
 // non-linear exhaustive search with cumulative sums
-static float make_qkxs_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) {
+static float make_qkxh_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) {
     float sumlx = 0.0f;
     float suml2 = 0.0f;
     float amax = -1.0f;
@@ -1687,11 +1687,18 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
 
     uint8_t L[QK_K];
     uint8_t Laux[16];
+    int8_t Lsaux[16];
     float mins[QK_K/16];
     float scales[QK_K/16];
     float sw[QK_K/16];
     float weight[16];
-    uint8_t Ls[QK_K/16], Lm[QK_K/16];
+    int8_t Ls[QK_K/16], Lm[QK_K/16];
+
+    struct k_heap_cell heap_cells_s[QK_K/16];
+    float odd_s[16];
+    struct k_heap k_heap_s;
+
+    k_heap_init_linear(&k_heap_s, 0, 15, heap_cells_s, odd_s);
 
     for (int i = 0; i < nb; i++) {
         memset(sw, 0, QK_K/16*sizeof(float));
@@ -1706,8 +1713,8 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
         }
 
         float dm, mm;
-        dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
-        mm  = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
+        dm  = make_qkxh_quants(QK_K/16, scales, sw, Ls, Lsaux, &k_heap_s, false);
+        mm  = make_qkxh_quants(QK_K/16, mins, sw, Lm, Lsaux, &k_heap_s, false);
 
         y[i].d    = GGML_FP32_TO_FP16(dm);
         y[i].dmin = GGML_FP32_TO_FP16(mm);
@@ -2607,7 +2614,12 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
     float weight[QK4_0];
     int8_t L[QK4_0];
     int8_t Laux[QK4_0];
-    struct fraction Faux[8 * QK4_0];
+    // struct fraction Faux[8 * QK4_0];
+    struct k_heap_cell heap_cells[QK4_0];
+    float odd[16];
+    struct k_heap k_heap;
+
+    k_heap_init_linear(&k_heap, -8, 7, heap_cells, odd);
 
     float sum_x2 = 0;
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
@@ -2618,7 +2630,8 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
         const float * xb = x + QK4_0 * ib;
         const float * qw = quant_weights + QK4_0 * ib;
         for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true);
+        // float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true);
+        float d = make_qkxh_quants(QK4_0, xb, weight, L, Laux, &k_heap, true);
         y[ib].d = GGML_FP32_TO_FP16(d);
         for (int j = 0; j < 16; ++j) {
             y[ib].qs[j] = L[j] | (L[j+16] << 4);
@@ -2697,7 +2710,12 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
     float weight[QK5_0];
     int8_t L[QK5_0];
     int8_t Laux[QK5_0];
-    struct fraction Faux[16 * QK5_0];
+    // struct fraction Faux[16 * QK5_0];
+    struct k_heap_cell heap_cells[QK5_0];
+    float odd[32];
+    struct k_heap k_heap;
+
+    k_heap_init_linear(&k_heap, -16, 15, heap_cells, odd);
 
     float sum_x2 = 0;
     for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
@@ -2708,7 +2726,8 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
         const float * xb = x + QK5_0 * ib;
         const float * qw = quant_weights + QK5_0 * ib;
         for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true);
+        float d = make_qkxh_quants(QK5_0, xb, weight, L, Laux, &k_heap, true);
+        // float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true);
         y[ib].d = GGML_FP32_TO_FP16(d);
 
         uint32_t qh = 0;
@@ -5505,7 +5524,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             scales[ib] = 0;
             continue;
         }
-        float d = make_qkxs_nl_quants(block_size, xb, weight, Lb, Laux, k_heap, true, !quant_weights);
+        float d = make_qkxh_nl_quants(block_size, xb, weight, Lb, Laux, k_heap, true, !quant_weights);
         scales[ib] = d;
         float abs_d = fabsf(d);
         if (abs_d > amax_scale) {
@@ -5516,19 +5535,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
     if (super_block_size/block_size > 1) {
         int nb = super_block_size/block_size;
         memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t));
+        // TODO: use make_qkxh_quants
         float d = -max_scale/32;
         dh[0] = GGML_FP32_TO_FP16(d);
         float id = d ? 1/d : 0.f;
         for (int ib = 0; ib < super_block_size/block_size; ++ib) {
             int l = nearest_int(id*scales[ib]);
             l = MAX(-32, MIN(31, l));
-            // float dl = d * l;
-            // float idl = dl ? 1/dl : 0.f;
-            // uint8_t * Lb = L + ib*block_size;
-            // const float * xb = x + ib*block_size;
-            // for (int j = 0; j < block_size; ++j) {
-            //     Lb[j] = best_index_int8(16, values, idl*xb[j]);
-            // }
             l += 32;
             uint8_t l_l = l & 0xf;
             uint8_t l_h = l >>  4;
@@ -5538,12 +5551,6 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         }
     } else {
         dh[0] = GGML_FP32_TO_FP16(scales[0]);
-        // if (ntry > 0) {
-        //     float id = scales[0] ? 1/scales[0] : 0;
-        //     for (int j = 0; j < super_block_size; ++j) {
-        //         L[j] = best_index_int8(16, values, id*x[j]);
-        //     }
-        // }
     }
 
     for (int i = 0; i < super_block_size/32; ++i) {

From 3e4b675c9f0f955965ad1ea3a927a57eee582b6d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sat, 22 Mar 2025 12:03:26 -0400
Subject: [PATCH 09/12] ggml-quants : use a max-heap for TQ1_0 and TQ2_0
 quantization

---
 ggml/src/ggml-quants.c | 238 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 199 insertions(+), 39 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 5e25887f10fc6..9be87a671ce83 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -1007,8 +1007,8 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict
 
 // exhaustive search with cumulative sums
 static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) {
-    const int nmin = -k_heap->mid_k; // TODO: maybe directly pass these
-    const int nmax = k_heap->k + nmin - 1;
+    const int nmin = MIN(0, -k_heap->mid_k); // TODO: maybe directly pass these
+    const int nmax = MAX(0, k_heap->k + nmin - 1);
     float amax = fabsf(x[0]);
     float w_amax = (weights ? weights[0] : x[0] * x[0]) * amax;
     int amax_i = 0;
@@ -1046,10 +1046,10 @@ static float make_qkxh_quants(int n, const float * restrict x, const float * res
     // Find the max range in [0, amax_range] which doesn't result in clamping.
     // This is the range from the side which would clamp first (biggest ratio of max to nmax).
     // But it's easier and safer to simply use the smallest range.
-    int amax_range = MIN(abs(nmin), abs(nmax));
+    int amax_range = MIN(-nmin, nmax);
     if (amax_range == 0) {
         // one side will clamp anyway
-        amax_range = MAX(abs(nmin), abs(nmax));
+        amax_range = MAX(-nmin, nmax);
     }
     float sumlx = 0.0f;
     float suml2 = 0.0f;
@@ -1087,40 +1087,192 @@ static float make_qkxh_quants(int n, const float * restrict x, const float * res
     const int max_odd = 2*(imax_range + 1) + 1;
     const float wmax = fabsf(x[w_amax_i]);
     // const float wmax = amax;
-    {
-        int best_p_i = -1; // consecutive with 0..n_frac
-        int i = 0;
-        while (k_heap->n > 0) {
-            struct fraction frac = k_heap_pop(k_heap);
-            if (frac.numer == 0.0f) { break; }
-            const float v_max_odd = frac.numer * max_odd;
-            if (wmax * frac.denom > v_max_odd) {
-                // stop when the inverse scale would result in clamping the most important value
-                break;
+    int best_p_i = -1; // consecutive with 0..n_frac
+    for (int i = 0; k_heap->n > 0; ++i) {
+        struct fraction frac = k_heap_pop(k_heap);
+        if (frac.numer == 0.0f) { break; }
+        const float v_max_odd = frac.numer * max_odd;
+        if (wmax * frac.denom > v_max_odd) {
+            // stop when the inverse scale would result in clamping the most important value
+            break;
+        }
+        // maximize the weighted cosine similarity
+        const int ii = frac.i;
+        const float w = weights ? weights[ii] : x[ii] * x[ii];
+        if (negative_scale) {
+            frac.numer = -frac.numer;
+        }
+        sumlx += w * frac.numer;
+        suml2 += w * frac.denom;
+        const float current = sumlx * sumlx;
+        Laux[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1;
+        if (suml2 > 0.0f && current * best_denom > best * suml2) {
+            best = current;
+            best_denom = suml2;
+            scale = sumlx / suml2;
+            if (i == best_p_i + 1) {
+                // reduce copies for consecutive bests
+                L[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1;
+            } else {
+                memcpy(L, Laux, n);
             }
-            // maximize the weighted cosine similarity
-            const int ii = frac.i;
-            const float w = weights ? weights[ii] : x[ii] * x[ii];
-            if (negative_scale) {
-                frac.numer = -frac.numer;
-            }
-            sumlx += w * frac.numer;
-            suml2 += w * frac.denom;
-            const float current = sumlx * sumlx;
-            Laux[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1;
-            if (suml2 > 0.0f && current * best_denom > best * suml2) {
-                best = current;
-                best_denom = suml2;
-                scale = sumlx / suml2;
-                if (i == best_p_i + 1) {
-                    // reduce copies for consecutive bests
-                    L[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1;
-                } else {
-                    memcpy(L, Laux, n);
+            best_p_i = i;
+        }
+    }
+
+    return scale;
+}
+
+// like make_qkxh_quants, but doesn't assume the sign of the scale is the sign of the absmax value
+static float make_qkxsh_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap) {
+    nmin = MIN(0, nmin);
+    nmax = MAX(0, nmax);
+    // start at zero
+    float amax = fabsf(x[0]);
+    float min = x[0];
+    float max = x[0];
+    float w_amax = weights[0] * amax;
+    int w_amax_i = 0;
+    for (int i = 1; i < n; ++i) {
+        const float w = weights[i];
+        const float ax = fabsf(x[i]);
+        const float wax = w * ax;
+        if (ax > amax) { amax = ax; }
+        if (x[i] > max) { max = x[i]; }
+        if (x[i] < min) { min = x[i]; }
+        // Find the most important value
+        if (wax > w_amax) { w_amax = wax; w_amax_i = i; }
+    }
+
+    if (amax < GROUP_MAX_EPS) { // all zero
+        memset(L, 0, n);
+        return 0.0f;
+    }
+
+    // Use the side which will clamp first.
+    // The first clamped value is the absmax at the end of the common range.
+    int amax_range = MIN(-nmin, nmax);
+    if (amax_range == 0) {
+        // One side will always clamp anyway
+        amax_range = MAX(-nmin, nmax);
+    }
+    float sumlx_p = 0.0f;
+    float suml2_p = 0.0f;
+    float sumlx_n = 0.0f;
+    float suml2_n = 0.0f;
+    float scale = 0.0f;
+    float best = 0.0f;
+    float best_denom = 1.0f;
+    int best_i = -1; // consecutive with 0..n_frac
+    // Pre-calculate the half-point for the common range.
+    // All smaller vectors have a representable vector with twice the values, and thus can be skipped.
+    if (amax_range > 1) {
+        const float iscale = ((float)((amax_range >> 1) + 1))/amax;
+        for (int i = 0; i < n; ++i) {
+            const float w = weights[i];
+            int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
+            Laux[i] = l + k_heap->mid_k;
+            suml2_p += w * l * l;
+            sumlx_p += w * l * x[i];
+        }
+        sumlx_n = -sumlx_p;
+        suml2_n = suml2_p;
+        const float current_p = sumlx_p * sumlx_p;
+        if (suml2_p > 0.0f) {
+            best = current_p;
+            best_denom = suml2_p;
+            scale = sumlx_p / suml2_p;
+            best_i = -1; // right before 0 of the loop after sorting
+        }
+    } else {
+        memset(Laux, k_heap->mid_k, n);
+    }
+    memcpy(L, Laux, n);
+
+    k_heap_set_x_L(k_heap, x, Laux, n, false);
+
+    // TODO: make that range sign-aware to reduce the search space
+    const int imax_range = MAX(nmax, -nmin);
+    const int max_odd = 2*(imax_range + 1) + 1;
+    const float wmax = fabsf(x[w_amax_i]);
+
+    const float max_common_odd = (MIN(nmax, -nmin) * 2) + 1;
+    const float max_odd_p = (nmax * 2) + 1;
+    const float max_odd_n = (-nmin * 2) + 1;
+    for (int i = 0; k_heap->n > 0; ++i) {
+        struct fraction frac = k_heap_pop(k_heap);
+        // maximize the weighted cosine similarity
+        const int ii = frac.i;
+        const float w = weights[ii];
+        const float lx = w * frac.numer;
+        const float odd = frac.denom;
+        const float l2 = w * odd;
+        if (wmax * odd > frac.numer * max_odd) {
+            // stop when the inverse scale would result in clamping the most important value
+            break;
+        }
+
+        Laux[ii] += x[ii] < 0.0f ? -1 : 1;
+
+        float sumlx;
+        float proj;
+        float norm;
+        if (odd < max_common_odd) {
+            sumlx_p += lx;
+            suml2_p += l2;
+            sumlx_n -= lx;
+            suml2_n += l2;
+
+            sumlx = sumlx_p;
+            proj = sumlx_p * sumlx_p;
+            norm = suml2_p;
+
+            // avoid double-copying Laux in a single iteration
+            if (suml2_p != suml2_n && suml2_p * suml2_n > 0.0f) {
+                const float proj_n = sumlx_n * sumlx_n;
+                if (proj_n * norm > proj * suml2_n) {
+                    sumlx = sumlx_n;
+                    proj = proj_n;
+                    norm = suml2_n;
                 }
-                best_p_i = i;
             }
-            i += 1;
+        } else if (x[ii] < 0.0f ? odd < max_odd_n : odd < max_odd_p) {
+            sumlx_p += lx;
+            suml2_p += l2;
+
+            sumlx = sumlx_p;
+            proj = sumlx_p * sumlx_p;
+            norm = suml2_p;
+        } else {
+            // outside the positive range means we're now into negatives
+            sumlx_n -= lx;
+            suml2_n += l2;
+
+            sumlx = sumlx_n;
+            proj = sumlx_n * sumlx_n;
+            norm = suml2_n;
+        }
+        if (norm > 0.0f && proj * best_denom > best * norm) {
+            best = proj;
+            best_denom = norm;
+            scale = sumlx / norm;
+            if (i == best_i + 1) {
+                // reduce copies for consecutive bests
+                L[ii] += x[ii] < 0.0f ? -1 : 1;
+            } else {
+                memcpy(L, Laux, n);
+            }
+            best_i = i;
+        }
+    }
+
+    if (scale < 0.0f) {
+        for (int i = 0; i < n; ++i) {
+            L[i] = MAX(nmin, MIN(-(L[i] - k_heap->mid_k), nmax)) - nmin;
+        }
+    } else {
+        for (int i = 0; i < n; ++i) {
+            L[i] = MAX(nmin, MIN(L[i] - k_heap->mid_k, nmax)) - nmin;
         }
     }
 
@@ -2898,7 +3050,11 @@ static void quantize_row_tq1_0_impl(const float * restrict x, block_tq1_0 * rest
     float weight[QK_K];
     int8_t L[QK_K];
     int8_t Laux[QK_K];
-    struct fraction Faux[1 * QK_K];
+    struct k_heap_cell heap_cells[QK_K];
+    float odd[3];
+    struct k_heap k_heap;
+
+    k_heap_init_linear(&k_heap, -1, 1, heap_cells, odd);
 
     float sum_x2 = 0;
     for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; }
@@ -2910,7 +3066,7 @@ static void quantize_row_tq1_0_impl(const float * restrict x, block_tq1_0 * rest
         const float * qw = quant_weights + QK_K * ib;
         const int8_t * Lptr = L;
         for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); }
-        float d = make_qkxs_quants(QK_K, -1, 1, xb, weight, L, Laux, Faux, false);
+        float d = make_qkxh_quants(QK_K, xb, weight, L, Laux, &k_heap, false);
         y[ib].d = GGML_FP32_TO_FP16(d);
 
         // 5 elements per byte, along 32 bytes
@@ -2999,7 +3155,11 @@ static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * rest
     float weight[QK_K];
     int8_t L[QK_K];
     int8_t Laux[QK_K];
-    struct fraction Faux[2 * QK_K];
+    struct k_heap_cell heap_cells[QK_K];
+    float odd[4 + 1];
+    struct k_heap k_heap;
+
+    k_heap_init_linear(&k_heap, -2, 2, heap_cells, odd);
 
     float sum_x2 = 0;
     for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; }
@@ -3010,7 +3170,7 @@ static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * rest
         const float * xb = x + QK_K * ib;
         const float * qw = quant_weights + QK_K * ib;
         for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); }
-        float d = make_qkxss_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux);
+        float d = make_qkxsh_quants(QK_K, -1, 2, xb, weight, L, Laux, &k_heap);
         y[ib].d = GGML_FP32_TO_FP16(d);
 
         for (size_t j = 0; j < sizeof(y->qs); j += 32) {

From af23abd3cb1385142271af3318c26d6475e17c54 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sat, 22 Mar 2025 12:07:28 -0400
Subject: [PATCH 10/12] ggml-quants : remove slower qsort-based cumulative
 search

---
 ggml/src/ggml-quants.c | 329 -----------------------------------------
 1 file changed, 329 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 9be87a671ce83..f2edfc16241b6 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -861,150 +861,6 @@ static struct fraction k_heap_pop(struct k_heap * k_heap) {
     };
 }
 
-// exhaustive search with cumulative sums
-// Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions
-static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) {
-    float max = x[0];
-    float min = x[0];
-    float w_amax = weights[0] * fabsf(x[0]);
-    int max_i = 0;
-    int w_amax_i = 0;
-    int min_i = 0;
-    for (int i = 1; i < n; ++i) {
-        if (x[i] < min) { min = x[i]; min_i = i; }
-        if (x[i] > max) { max = x[i]; max_i = i; }
-        // Find the most important value
-        const float w = weights[i];
-        const float wax = w * fabsf(x[i]);
-        if (wax > w_amax) {
-            w_amax = wax;
-            w_amax_i = i;
-        }
-    }
-    const int amax_i = fabsf(min) > fabsf(max) ? min_i : max_i;
-    const float amax = fabsf(x[amax_i]);
-
-    if (amax < GROUP_MAX_EPS) { // all zero
-        for (int i = 0; i < n; ++i) {
-            L[i] = 0;
-        }
-        return 0.0f;
-    }
-
-    bool negative_scale = false;
-    if (signed_scale && -nmin != nmax) {
-        // the max side should have the biggest range
-        // NOTE: this is not always the best sign
-        if ((x[amax_i] < 0.0f) == (-nmin < nmax)) {
-            // [-4, 3] ==> [-3, 4]
-            const int tmp = nmin;
-            const float ftmp = min;
-            nmin = -nmax;
-            nmax = -tmp;
-            min = -max;
-            max = -ftmp;
-            negative_scale = true;
-        }
-    }
-
-    // Find the max range in [0, amax_range] which doesn't result in clamping.
-    // This is the range from the side which would clamp first (biggest ratio of max to nmax).
-    int amax_range;
-    float range_max;
-    if (fabsf(-max * nmin) < fabsf(-min * nmax)) {
-        amax_range = MAX(0, -nmin);
-        range_max = fabsf(min);
-    } else {
-        amax_range = MAX(0, nmax);
-        range_max = fabsf(max);
-    }
-    float sumlx = 0.0f;
-    float suml2 = 0.0f;
-    float scale = 0.0f;
-    float best = 0.0f;
-    float best_denom = 1.0f;
-    if (amax_range > 1) {
-        // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value.
-        // Proof: anything smaller has a representable vector with values twice as big.
-        const float iscale = ((float)((amax_range >> 1) + 1))/range_max * (negative_scale ? -1.0f : 1.0f);
-        for (int i = 0; i < n; ++i) {
-            const float w = weights[i];
-            int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
-            if (negative_scale) { l = -l; }
-            Laux[i] = l;
-            L[i] = l;
-            suml2 += w * l * l;
-            sumlx += w * l * x[i];
-        }
-        best = sumlx * sumlx;
-        best_denom = suml2; // should never be zero
-        scale = sumlx / suml2;
-    } else {
-        for (int i = 0; i < n; ++i) {
-            Laux[i] = 0;
-            L[i] = 0;
-        }
-    }
-
-    const int imax_range = MAX(0, (x[w_amax_i] < 0.0f) ? -nmin : nmax);
-    const int max_odd = 2*(imax_range + 1) + 1;
-    const float wmax = fabsf(x[w_amax_i]);
-    int n_frac = 0;
-    for (int i = 0; i < n; ++i) {
-        // assuming nmin <= nmax
-        const int odd_max = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmin : nmax);
-        const int odd_min = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmax : nmin);
-        const float v = fabsf(x[i]);
-        const float v_max_odd = v * max_odd;
-        for (int j = odd_min; j < odd_max; ++j) {
-            const float odd = 2*j + 1;
-            if (wmax * odd < v_max_odd) {
-                Faux[n_frac++] = (struct fraction){
-                    .numer=v,
-                    .denom=odd,
-                    .i=i,
-                };
-            } else {
-                // stop when the inverse scale would result in clamping the most important value
-                break;
-            }
-        }
-    }
-
-    qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc);
-
-    int best_p_i = -1; // consecutive with 0..n_frac
-    for (int i = 0; i < n_frac; ++i) {
-        // maximize the weighted cosine
-        const int ii = Faux[i].i;
-        const float w = weights ? weights[ii] : x[ii] * x[ii];
-        sumlx += w * Faux[i].numer;
-        suml2 += w * Faux[i].denom;
-        const float current = sumlx * sumlx;
-        Laux[ii] += x[ii] < 0.0f ? -1 : 1;
-        if (suml2 > 0.0f && Faux[i].numer > 0.0f && current * best_denom > best * suml2) {
-            best = current;
-            best_denom = suml2;
-            scale = sumlx / suml2;
-            if (i == best_p_i + 1) {
-                // reduce copies for consecutive bests
-                L[ii] += x[ii] < 0.0f ? -1 : 1;
-            } else {
-                for (int j = 0; j < n; ++j) {
-                    L[j] = Laux[j];
-                }
-            }
-            best_p_i = i;
-        }
-    }
-    for (int i = 0; i < n; ++i) {
-        L[i] = negative_scale ? (-L[i] + nmax) : (L[i] + -nmin);
-        GGML_ASSERT(L[i] >= 0 && L[i] <= nmax - nmin);
-    }
-
-    return negative_scale ? -scale : scale;
-}
-
 // exhaustive search with cumulative sums
 static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) {
     const int nmin = MIN(0, -k_heap->mid_k); // TODO: maybe directly pass these
@@ -1279,182 +1135,6 @@ static float make_qkxsh_quants(int n, int nmin, int nmax, const float * restrict
     return scale;
 }
 
-// Very similar to make_qkxs_quants, but the sign of the scale is not assumed to be the sign of the absmax value.
-static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux) {
-    // start at zero
-    nmin = MIN(0, nmin);
-    nmax = MAX(0, nmax);
-    float amax = 0.0f;
-    float min = 0.0f;
-    float max = 0.0f;
-    float w_amax = 0.0f;
-    int amax_i = -1;
-    int w_amax_i = -1;
-    for (int i = 0; i < n; ++i) {
-        const float w = weights ? weights[i] : x[i] * x[i];
-        const float ax = fabsf(x[i]);
-        const float wax = w * ax;
-        if (ax > amax) { amax = ax; amax_i = i; }
-        if (x[i] > max) { max = x[i]; }
-        if (x[i] < min) { min = x[i]; }
-        // Find the most important value
-        if (wax > w_amax) { w_amax = wax; w_amax_i = i; }
-    }
-
-    if (amax < GROUP_MAX_EPS || amax_i < 0 || w_amax_i < 0) { // all zero
-        for (int i = 0; i < n; ++i) { L[i] = 0; }
-        return 0.0f;
-    }
-
-    // Use the side which will clamp first.
-    // The first clamped value is the absmax at the end of the common range.
-    // TODO: reduce the search space when one of the ranges is 0
-    const int amax_range = MIN(-nmin, nmax);
-    float sumlx_p = 0.0f;
-    float suml2_p = 0.0f;
-    float sumlx_n = 0.0f;
-    float suml2_n = 0.0f;
-    float scale = 0.0f;
-    float best = 0.0f;
-    float best_denom = 1.0f;
-    int best_i = -2; // not consecutive with 0..n_frac
-    // Pre-calculate the half-point for the common range.
-    // All smaller vectors have a representable vector with twice the values, and thus can be skipped.
-    if (amax_range > 1) {
-        const float iscale = ((float)((amax_range >> 1) + 1))/amax;
-        for (int i = 0; i < n; ++i) {
-            const float w = weights ? weights[i] : x[i] * x[i];
-            int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax));
-            Laux[i] = l;
-            suml2_p += w * l * l;
-            sumlx_p += w * l * x[i];
-        }
-        sumlx_n = -sumlx_p;
-        suml2_n = suml2_p;
-        const float current_p = sumlx_p * sumlx_p;
-        if (suml2_p > 0.0f && current_p * best_denom > best * suml2_p) {
-            best = current_p;
-            best_denom = suml2_p;
-            scale = sumlx_p / suml2_p;
-            for (int i = 0; i < n; ++i) {
-                L[i] = Laux[i];
-            }
-            best_i = -1; // right before 0 of the loop after sorting
-        }
-    } else {
-        for (int i = 0; i < n; ++i) {
-            Laux[i] = 0;
-        }
-    }
-
-    const int imax_range = MAX(nmax, -nmin);
-    const int max_odd = 2*(imax_range + 1) + 1;
-    const float wmax = fabsf(x[w_amax_i]);
-    int n_frac = 0;
-    for (int i = 0; i < n; ++i) {
-        // assuming nmin <= nmax
-        const int odd_max = MAX(nmax, -nmin);
-        const float v = fabsf(x[i]);
-        const float v_max_odd = v * max_odd;
-        for (int j = abs(Laux[i]); j < odd_max; ++j) {
-            const float odd = 2*j + 1;
-            const float wmax_odd = wmax * odd;
-            if (wmax_odd < v_max_odd) {
-                Faux[n_frac++] = (struct fraction){
-                    .numer=v,
-                    .denom=odd,
-                    .i=i,
-                };
-            } else {
-                // stop when the inverse scale would result in clamping the most important value
-                break;
-            }
-        }
-    }
-
-    qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc);
-
-    const float max_common_odd = (MIN(nmax, -nmin) * 2) + 1;
-    const float max_odd_p = (nmax * 2) + 1;
-    const float max_odd_n = (-nmin * 2) + 1;
-
-    for (int i = 0; i < n_frac; ++i) {
-        // maximize the weighted cosine similarity
-        const int ii = Faux[i].i;
-        const float w = weights ? weights[ii] : x[ii] * x[ii];
-        const float lx = w * Faux[i].numer;
-        const float odd = Faux[i].denom;
-        const float l2 = w * odd;
-
-        Laux[ii] += x[ii] < 0.0f ? -1 : 1;
-
-        float sumlx = 0.0f;
-        float proj = 0.0f;
-        float norm = 0.0f;
-        if (odd < max_common_odd) {
-            sumlx_p += lx;
-            suml2_p += l2;
-            sumlx_n -= lx;
-            suml2_n += l2;
-
-            sumlx = sumlx_p;
-            proj = sumlx_p * sumlx_p;
-            norm = suml2_p;
-
-            // avoid double-copying Laux in a single iteration
-            if (suml2_p != suml2_n && suml2_p * suml2_n > 0.0f) {
-                const float proj_n = sumlx_n * sumlx_n;
-                if (proj_n * norm > proj * suml2_n) {
-                    sumlx = sumlx_n;
-                    proj = proj_n;
-                    norm = suml2_n;
-                }
-            }
-        } else if (x[ii] < 0.0f ? odd < max_odd_n : odd < max_odd_p) {
-            sumlx_p += lx;
-            suml2_p += l2;
-
-            sumlx = sumlx_p;
-            proj = sumlx_p * sumlx_p;
-            norm = suml2_p;
-        } else {
-            // outside the positive range means we're now into negatives
-            sumlx_n -= lx;
-            suml2_n += l2;
-
-            sumlx = sumlx_n;
-            proj = sumlx_n * sumlx_n;
-            norm = suml2_n;
-        }
-        if (norm > 0.0f && proj * best_denom > best * norm) {
-            best = proj;
-            best_denom = norm;
-            scale = sumlx / norm;
-            if (i == best_i + 1) {
-                // reduce copies for consecutive bests
-                L[ii] += x[ii] < 0.0f ? -1 : 1;
-            } else {
-                for (int j = 0; j < n; ++j) {
-                    L[j] = Laux[j];
-                }
-            }
-            best_i = i;
-        }
-    }
-
-    if (scale < 0.0f) {
-        for (int i = 0; i < n; ++i) {
-            L[i] = MAX(nmin, MIN(-L[i], nmax)) - nmin;
-        }
-    } else {
-        for (int i = 0; i < n; ++i) {
-            L[i] = MAX(nmin, MIN(L[i], nmax)) - nmin;
-        }
-    }
-
-    return scale;
-}
-
 // non-linear exhaustive search with cumulative sums
 static float make_qkxh_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) {
     float sumlx = 0.0f;
@@ -1924,7 +1604,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
 
     int8_t L[QK_K];
     int8_t Laux[16];
-    // struct fraction Faux[16 * 4];
     struct k_heap_cell heap_cells[16];
     float odd[8];
     struct k_heap k_heap;
@@ -1942,7 +1621,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
         float max_scale = 0;
         float amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {
-            // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true);
             scales[j] = make_qkxh_quants(16, x + 16*j, weights, L + 16*j, Laux, &k_heap, true);
             float scale = fabsf(scales[j]);
             if (scale > amax) {
@@ -2052,7 +1730,6 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
     float weight[16];
     float sw[QK_K / 16];
     int8_t Ls[QK_K / 16];
-    // struct fraction Faux[16 * 32];
     struct k_heap_cell heap_cells[16];
     float odd[8];
     struct k_heap k_heap;
@@ -2080,14 +1757,12 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
             for (int l = 0; l < 16; ++l) sumw += weight[l];
             sw[j] = sumw;
 
-            // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true);
             scales[j] = make_qkxh_quants(16, x + 16*j, weight, L + 16*j, Laux, &k_heap, true);
 
         }
 
         memset(y[i].scales, 0, 12);
 
-        // float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true);
         float d_block = make_qkxh_quants(QK_K/16, scales, sw, Ls, Laux, &k_heap_s, true);
         for (int j = 0; j < QK_K/16; ++j) {
             int l = Ls[j];
@@ -2766,7 +2441,6 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
     float weight[QK4_0];
     int8_t L[QK4_0];
     int8_t Laux[QK4_0];
-    // struct fraction Faux[8 * QK4_0];
     struct k_heap_cell heap_cells[QK4_0];
     float odd[16];
     struct k_heap k_heap;
@@ -2782,7 +2456,6 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
         const float * xb = x + QK4_0 * ib;
         const float * qw = quant_weights + QK4_0 * ib;
         for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
-        // float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true);
         float d = make_qkxh_quants(QK4_0, xb, weight, L, Laux, &k_heap, true);
         y[ib].d = GGML_FP32_TO_FP16(d);
         for (int j = 0; j < 16; ++j) {
@@ -2862,7 +2535,6 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
     float weight[QK5_0];
     int8_t L[QK5_0];
     int8_t Laux[QK5_0];
-    // struct fraction Faux[16 * QK5_0];
     struct k_heap_cell heap_cells[QK5_0];
     float odd[32];
     struct k_heap k_heap;
@@ -2879,7 +2551,6 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
         const float * qw = quant_weights + QK5_0 * ib;
         for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
         float d = make_qkxh_quants(QK5_0, xb, weight, L, Laux, &k_heap, true);
-        // float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true);
         y[ib].d = GGML_FP32_TO_FP16(d);
 
         uint32_t qh = 0;

From 8b8b88f3de204486eddeaabc64015384b921b09f Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sat, 22 Mar 2025 18:47:56 -0400
Subject: [PATCH 11/12] ggml-quants : restore Q2_K use of make_qp_quants

Weirdly, it seems like in practice replacing this instance is not better.
This is probably because of its interaction with make_qkx3_quants.
---
 ggml/src/ggml-quants.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 8e52f99c6070c..3b08b7e6a8c4e 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -1519,18 +1519,11 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G
 
     uint8_t L[QK_K];
     uint8_t Laux[16];
-    int8_t Lsaux[16];
     float mins[QK_K/16];
     float scales[QK_K/16];
     float sw[QK_K/16];
     float weight[16];
-    int8_t Ls[QK_K/16], Lm[QK_K/16];
-
-    struct k_heap_cell heap_cells_s[QK_K/16];
-    float odd_s[16];
-    struct k_heap k_heap_s;
-
-    k_heap_init_linear(&k_heap_s, 0, 15, heap_cells_s, odd_s);
+    uint8_t Ls[QK_K/16], Lm[QK_K/16];
 
     for (int i = 0; i < nb; i++) {
         memset(sw, 0, QK_K/16*sizeof(float));
@@ -1545,8 +1538,8 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G
         }
 
         float dm, mm;
-        dm  = make_qkxh_quants(QK_K/16, scales, sw, Ls, Lsaux, &k_heap_s, false);
-        mm  = make_qkxh_quants(QK_K/16, mins, sw, Lm, Lsaux, &k_heap_s, false);
+        dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
+        mm  = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
 
         y[i].d    = GGML_FP32_TO_FP16(dm);
         y[i].dmin = GGML_FP32_TO_FP16(mm);

From a5b1943912c5657482b1921ef8385c72b9609b5d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin <git@compilade.net>
Date: Sun, 23 Mar 2025 17:59:37 -0400
Subject: [PATCH 12/12] ggml-quants : fix some edge cases in
 make_qkxh_nl_quants

---
 ggml/src/ggml-quants.c | 66 ++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 3b08b7e6a8c4e..728e77fe87cd0 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -1149,10 +1149,11 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo
             amax = ax;
             amax_i = i;
         }
-        Laux[i] = k_heap->mid_k;
         sumlx += w * x[i] * kmin;
         suml2 += w * kmin * kmin;
     }
+    memset(Laux, k_heap->mid_k, n);
+    memset(L, k_heap->mid_k, n);
 
     const bool neg_scale = signed_scale && fast ? (x[amax_i] < 0.0f) != (k_heap->kmax < 0) : false;
 
@@ -1163,57 +1164,49 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo
     float best_suml2;
     if (suml2 != 0.0f) {
         best = sumlx * sumlx;
-        best_sumlx = neg_scale ? -sumlx : sumlx;
-        best_suml2 = suml2 != 0.0f ? suml2 : 1.0f;
+        best_sumlx = sumlx; // can't change the sign of kmin
+        best_suml2 = suml2;
     } else {
         best = 0.0f;
         best_sumlx = 0.0f;
         best_suml2 = 1.0f;
     }
-    {
-        float sumlx_p = neg_scale ? -sumlx : sumlx;
-        float suml2_p = suml2;
-        int best_p_i = -2; // not consecutive with 0..n_frac
-        int i = 0;
-        while (k_heap->n > 0) {
-            struct fraction frac = k_heap_pop(k_heap);
-            const int ii = frac.i;
-            const float w = weights ? weights[ii] : x[ii] * x[ii];
-            sumlx_p += w * frac.numer;
-            suml2_p += w * frac.denom;
-            const float current = sumlx_p * sumlx_p;
-            Laux[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1;
-            if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) {
-                best = current;
-                best_sumlx = neg_scale ? -sumlx_p : sumlx_p;
-                best_suml2 = suml2_p;
-                if (i == best_p_i + 1) {
-                    // reduce copies for consecutive bests
-                    L[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1;
-                } else {
-                    for (int j = 0; j < n; ++j) {
-                        L[j] = Laux[j];
-                    }
-                }
-                best_p_i = i;
+    float sumlx_p = neg_scale ? -sumlx : sumlx;
+    float suml2_p = suml2;
+    int best_p_i = -1; // consecutive with 0..n_frac
+    for (int i = 0; k_heap->n > 0; ++i) {
+        struct fraction frac = k_heap_pop(k_heap);
+        const int ii = frac.i;
+        const float w = weights ? weights[ii] : x[ii] * x[ii];
+        sumlx_p += w * frac.numer;
+        suml2_p += w * frac.denom;
+        const float current = sumlx_p * sumlx_p;
+        Laux[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1;
+        if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) {
+            best = current;
+            best_sumlx = neg_scale ? -sumlx_p : sumlx_p;
+            best_suml2 = suml2_p;
+            if (i == best_p_i + 1) {
+                // reduce copies for consecutive bests
+                L[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1;
+            } else {
+                memcpy(L, Laux, n);
             }
+            best_p_i = i;
         }
     }
 
     // Non-linear mappings are usually not symmetric, so try negating the scale
     // This is the same as above, but keeping the old best if the new best is not better.
     if (signed_scale && !fast) {
-        for (int i = 0; i < n; ++i) {
-            Laux[i] = k_heap->mid_k;
-        }
+        memset(Laux, k_heap->mid_k, n);
 
         k_heap_set_x(k_heap, x, n, true);
 
         float sumlx_n = -sumlx;
         float suml2_n = suml2;
         int best_n_i = -2; // not consecutive with 0..n_frac
-        int i = 0;
-        while (k_heap->n > 0) {
+        for (int i = 0; k_heap->n > 0; ++i) {
             struct fraction frac = k_heap_pop(k_heap);
             const int ii = frac.i;
             const float w = weights ? weights[ii] : x[ii] * x[ii];
@@ -1229,13 +1222,10 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo
                     // reduce copies for consecutive bests
                     L[ii] += x[ii] >= 0.0f ? -1 : 1;
                 } else {
-                    for (int j = 0; j < n; ++j) {
-                        L[j] = Laux[j];
-                    }
+                    memcpy(L, Laux, n);
                 }
                 best_n_i = i;
             }
-            ++i;
         }
     }