From dd6b8408c975dcf72fd682ea5e80d5ad66075b60 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 21 Feb 2025 13:14:32 -0500 Subject: [PATCH 01/12] ggml-quants : improve IQ4_NL, IQ4_XS, and Q3_K --- ggml/src/ggml-quants.c | 400 ++++++++++++++++++++++++++++++++--------- 1 file changed, 320 insertions(+), 80 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7918388ae9f2e..638766ae89bcf 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -628,6 +628,258 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f return scale; } +struct fraction { + // float frac; + float numer; + float denom; + int i; +}; + +// comparator function for sorting fractions in make_qkxs_quants +static inline int compare_fractions_desc(const void * a, const void * b) { + const struct fraction * f_a = (const struct fraction *) a; + const struct fraction * f_b = (const struct fraction *) b; + float na = f_a->numer; + float da = f_a->denom; + float nb = f_b->numer; + float db = f_b->denom; + + // Stable sort + // a - b sorts ascending, which means + // 1 swaps, -1 stays + if (da == db) { // equal denominators + return (na == nb) ? ((a > b) ? 1 : -1) : (na < nb) ? 1 : -1; + } + if (na == nb) { // equal numerators + return (da > db) ? 1 : -1; + } + float ab = na * db; + float ba = nb * da; + return (ab == ba) ? ((a > b) ? 1 : -1) : (ab < ba) ? 1 : -1; +} + +// exhaustive search with cumulative sums +// Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions +static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, struct fraction * restrict Faux, bool signed_scale) { + float max = 0.0f; + float amax = 0.0f; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { + amax = ax; + max = x[i]; + } + } + bool negative_scale = false; + if (signed_scale && -nmin != nmax) { + // the max side should have the biggest range + if ((max < 0.0f) == (-nmin < nmax)) { + // [-4, 3] ==> [-3, 4] + int tmp = nmin; + nmin = -nmax; + nmax = -tmp; + negative_scale = true; + } + } + if (amax < GROUP_MAX_EPS) { // all zero + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.0f; + } + int n_frac = 0; + for (int i = 0; i < n; ++i) { + // assuming nmin <= nmax + const int odd_max = MAX(0, x[i] < 0 ? -nmin : nmax); + const int odd_min = MAX(0, x[i] < 0 ? -nmax : nmin); + const float v = fabsf(x[i]); + // fprintf(stderr, "%s: i=%d, odd_min=%d, odd_max=%d\n", __func__, i, odd_min, odd_max); + for (int j = odd_min; j < odd_max; ++j) { + const float odd = 2*j + 1; + Faux[n_frac++] = (struct fraction){ + .numer=v, + .denom=odd, + .i=i, + }; + } + } + + qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc); + + float iscale = 0.0f; + { + float sumlx = 0.0f; + float suml2 = 0.0f; + float best = 0.0f; + float best_denom = 1.0f; + for (int i = 0; i < n_frac; ++i) { + // maximize the weighted cosine + const int ii = Faux[i].i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + sumlx += w * Faux[i].numer; + suml2 += w * Faux[i].denom; + const float current = sumlx * sumlx; + // fprintf(stderr, "%s: Faux[%d]=(%f/%f) * %f, square(sumlx)=%f, suml2=%f, k*cos2=%f\n", __func__, i, Faux[i].numer, Faux[i].denom, Faux[i].weight, current, suml2, current / suml2); + // use the last in case of equality + // FIXME: > or >= ?? Why does [0, 0, 1] rounds to [0, 0, 0] with >= ? + if (suml2 > 0.0f && current * best_denom > best * suml2) { + best = current; + best_denom = suml2; + iscale = Faux[i].numer > 0.0f ? Faux[i].denom / (2.0f * Faux[i].numer) : 0.0f; + if (!isfinite(iscale)) { + fprintf(stderr, "%s: iscale is not finite, %f/(2*%f)\n", __func__, Faux[i].denom, Faux[i].numer); + } + } + } + } + // (very) small fudging necessary because floats otherwise round to nearest even + iscale = iscale * ((float)((1 << 23) + 1) / (float)(1 << 23)); + + float sumlx = 0.0f; + float suml2 = 0.0f; + for (int i = 0; i < n; ++i) { + // Rounding away from zero is assumed by the search algorithm above. + int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); + if (negative_scale) { + l = -l; + } + L[i] = negative_scale ? l + nmax : l - nmin; + float w = weights ? weights[i] : x[i] * x[i]; + // weighted projection scale + sumlx += w * x[i] * l; + suml2 += w * l * l; + } + + return suml2 > 0.0f ? sumlx / suml2 : 0.0f; +} + +// non-linear exhaustive search with cumulative sums +// Need Faux to have room for n*k fractions +static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const float * restrict weights, const int8_t * restrict kvalues, uint8_t * restrict L, uint8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) { + float sumlx = 0.0f; + float suml2 = 0.0f; + int kmin = abs(kvalues[0]); + int koff = 0; + for (int i = 1; i < k; ++i) { + int ak = abs(kvalues[i]); + if (ak < kmin) { + kmin = ak; + koff = i; + } + } + kmin = kvalues[koff]; + for (int i = 0; i < n; ++i) { + float w = weights ? weights[i] : x[i] * x[i]; + Laux[i] = koff; + sumlx += w * x[i] * kmin; + suml2 += w * kmin * kmin; + } + + int n_frac_p = 0; + for (int i = 0; i < n; ++i) { + const int start = x[i] < 0.0f ? 1 : koff + 1; + const int end = x[i] < 0.0f ? koff + 1: k; + for (int j = start; j < end; ++j) { + const float threshold = kvalues[j] + kvalues[j - 1]; + const float step = kvalues[j] - kvalues[j - 1]; + Faux[n_frac_p++] = (struct fraction){ + // This should always be positive or else + // the fraction comparison function won't work properly + .numer=fabsf(x[i] * step), + // It's amazing how this is still the difference of consecutive squares + .denom=fabsf(threshold * step), + .i=i, + }; + } + } + + qsort(Faux, n_frac_p, sizeof(struct fraction), compare_fractions_desc); + + float best = 0.0f; + float best_sumlx = 0.0f; + float best_suml2 = 1.0f; + float sumlx_p = sumlx; + float suml2_p = suml2; + int best_p_i = -2; // not consecutive with 0..n_frac + for (int i = 0; i < n_frac_p; ++i) { + const int ii = Faux[i].i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + sumlx_p += w * Faux[i].numer; + suml2_p += w * Faux[i].denom; + const float current = sumlx_p * sumlx_p; + Laux[ii] += x[ii] < 0.0f ? -1 : 1; + if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) { + best = current; + best_sumlx = sumlx_p; + best_suml2 = suml2_p; + if (i == best_p_i + 1) { + // reduce copies for consecutive bests + L[ii] += x[ii] < 0.0f ? -1 : 1; + } else { + for (int j = 0; j < n; ++j) { + L[j] = Laux[j]; + } + } + best_p_i = i; + } + } + + // Non-linear mappings are usually not symmetric, so try negating the scale + if (signed_scale) { + for (int i = 0; i < n; ++i) { + Laux[i] = koff; + } + + int n_frac_n = 0; + for (int i = 0; i < n; ++i) { + const int start = x[i] >= 0.0f ? 1 : koff + 1; + const int end = x[i] >= 0.0f ? koff + 1: k; + for (int j = start; j < end; ++j) { + const float threshold = kvalues[j] + kvalues[j - 1]; + const float step = kvalues[j] - kvalues[j - 1]; + Faux[n_frac_n++] = (struct fraction){ + // This should always be positive or else + // the fraction comparison function won't work properly + .numer=fabsf(x[i] * step), + // It's amazing how this is still the difference of consecutive squares + .denom=fabsf(threshold * step), + .i=i, + }; + } + } + + qsort(Faux, n_frac_n, sizeof(struct fraction), compare_fractions_desc); + + float sumlx_n = -sumlx; + float suml2_n = suml2; + int best_n_i = -2; // not consecutive with 0..n_frac + for (int i = 0; i < n_frac_n; ++i) { + const int ii = Faux[i].i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + sumlx_n += w * Faux[i].numer; + suml2_n += w * Faux[i].denom; + const float current = sumlx_n * sumlx_n; + Laux[ii] += x[ii] >= 0.0f ? -1 : 1; + if (suml2_n > 0.0f && current * best_suml2 > best * suml2_n) { + best = current; + best_sumlx = -sumlx_n; + best_suml2 = suml2_n; + if (i == best_n_i + 1) { + // reduce copies for consecutive bests + L[ii] += x[ii] >= 0.0f ? -1 : 1; + } else { + for (int j = 0; j < n; ++j) { + L[j] = Laux[j]; + } + } + best_n_i = i; + } + } + } + + return best_suml2 != 0.0f ? best_sumlx / best_suml2 : 0.0f; +} + static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -982,14 +1234,21 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in const int nb = k / QK_K; int8_t L[QK_K]; + struct fraction Faux[16 * 4]; float scales[QK_K / 16]; + float weights[16]; + + for (int i = 0; i < 16; ++i) { + weights[i] = 1.0f; + } for (int i = 0; i < nb; i++) { float max_scale = 0; float amax = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true); + scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Faux, true); + // scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true); float scale = fabsf(scales[j]); if (scale > amax) { amax = scale; max_scale = scales[j]; @@ -1015,20 +1274,20 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in y[i].d = GGML_FP32_TO_FP16(0.f); } - int8_t sc; - for (int j = 0; j < QK_K/16; ++j) { - sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; - sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; - float d = GGML_FP16_TO_FP32(y[i].d) * sc; - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - L[16*j + ii] = l + 4; - } - } + // int8_t sc; + // for (int j = 0; j < QK_K/16; ++j) { + // sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; + // sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; + // float d = GGML_FP16_TO_FP32(y[i].d) * sc; + // if (!d) { + // continue; + // } + // for (int ii = 0; ii < 16; ++ii) { + // int l = nearest_int(x[16*j + ii]/d); + // l = MAX(-4, MIN(3, l)); + // L[16*j + ii] = l + 4; + // } + // } memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. @@ -1112,6 +1371,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri float weight[16]; float sw[QK_K / 16]; int8_t Ls[QK_K / 16]; + struct fraction Faux[16 * 32]; for (int i = 0; i < nb; i++) { @@ -1130,13 +1390,15 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri for (int l = 0; l < 16; ++l) sumw += weight[l]; sw[j] = sumw; - scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight); + // scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight); + scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Faux, true); } memset(y[i].scales, 0, 12); - float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw); + // float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw); + float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Faux, true); for (int j = 0; j < QK_K/16; ++j) { int l = Ls[j]; if (j < 8) { @@ -1149,20 +1411,20 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri } y[i].d = GGML_FP32_TO_FP16(d_block); - int8_t sc; - for (int j = 0; j < QK_K/16; ++j) { - sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; - sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; - float d = GGML_FP16_TO_FP32(y[i].d) * sc; - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - L[16*j + ii] = l + 4; - } - } + // int8_t sc; + // for (int j = 0; j < QK_K/16; ++j) { + // sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; + // sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; + // float d = GGML_FP16_TO_FP32(y[i].d) * sc; + // if (!d) { + // continue; + // } + // for (int ii = 0; ii < 16; ++ii) { + // int l = nearest_int(x[16*j + ii]/d); + // l = MAX(-4, MIN(3, l)); + // L[16*j + ii] = l + 4; + // } + // } memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. @@ -4572,10 +4834,9 @@ static inline int best_index_int8(int n, const int8_t * val, float x) { static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x, ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, - float * scales, float * weight, uint8_t * L, + float * scales, float * weight, uint8_t * L, uint8_t * Laux, struct fraction * Faux, const int8_t * values, - const float * quant_weights, - const int ntry) { + const float * quant_weights) { float sigma2 = 0; for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j]; @@ -4592,7 +4853,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block const float * qw = quant_weights + ib*block_size; for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } else { - for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j]; + for (int j = 0; j < block_size; ++j) weight[j] = sqrtf(sigma2 + xb[j]*xb[j]); + // for (int j = 0; j < block_size; ++j) weight[j] = 1; } float amax = 0, max = 0; for (int j = 0; j < block_size; ++j) { @@ -4605,35 +4867,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block scales[ib] = 0; continue; } - float d = ntry > 0 ? -max/values[0] : max/values[0]; - float id = 1/d; - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < block_size; ++j) { - float al = id*xb[j]; - int l = best_index_int8(16, values, al); - Lb[j] = l; - float q = values[l]; - float w = weight[j]; - sumqx += w*q*xb[j]; - sumq2 += w*q*q; - } - d = sumqx/sumq2; - float best = d*sumqx; - for (int itry = -ntry; itry <= ntry; ++itry) { - id = (itry + values[0])/max; - sumqx = sumq2 = 0; - for (int j = 0; j < block_size; ++j) { - float al = id*xb[j]; - int l = best_index_int8(16, values, al); - float q = values[l]; - float w = weight[j]; - sumqx += w*q*xb[j]; - sumq2 += w*q*q; - } - if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { - d = sumqx/sumq2; best = d * sumqx; - } - } + float d = make_qkxs_nl_quants(block_size, 16, xb, weight, values, Lb, Laux, Faux, true); scales[ib] = d; float abs_d = fabsf(d); if (abs_d > amax_scale) { @@ -4650,13 +4884,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block for (int ib = 0; ib < super_block_size/block_size; ++ib) { int l = nearest_int(id*scales[ib]); l = MAX(-32, MIN(31, l)); - float dl = d * l; - float idl = dl ? 1/dl : 0.f; - uint8_t * Lb = L + ib*block_size; - const float * xb = x + ib*block_size; - for (int j = 0; j < block_size; ++j) { - Lb[j] = best_index_int8(16, values, idl*xb[j]); - } + // float dl = d * l; + // float idl = dl ? 1/dl : 0.f; + // uint8_t * Lb = L + ib*block_size; + // const float * xb = x + ib*block_size; + // for (int j = 0; j < block_size; ++j) { + // Lb[j] = best_index_int8(16, values, idl*xb[j]); + // } l += 32; uint8_t l_l = l & 0xf; uint8_t l_h = l >> 4; @@ -4666,12 +4900,12 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } else { dh[0] = GGML_FP32_TO_FP16(scales[0]); - if (ntry > 0) { - float id = scales[0] ? 1/scales[0] : 0; - for (int j = 0; j < super_block_size; ++j) { - L[j] = best_index_int8(16, values, id*x[j]); - } - } + // if (ntry > 0) { + // float id = scales[0] ? 1/scales[0] : 0; + // for (int j = 0; j < super_block_size; ++j) { + // L[j] = best_index_int8(16, values, id*x[j]); + // } + // } } for (int i = 0; i < super_block_size/32; ++i) { @@ -4686,6 +4920,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t int64_t nblock = n_per_row/QK4_NL; char * qrow = (char *)dst; uint8_t L[QK4_NL]; + uint8_t Laux[QK4_NL]; + struct fraction Faux[QK4_NL * 16]; float weight[QK4_NL]; uint16_t unused_h; uint8_t * unused_l = NULL; @@ -4695,7 +4931,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, - &scale, weight, L, kvalues_iq4nl, qw, 7); + &scale, weight, L, Laux, Faux, kvalues_iq4nl, qw); } src += n_per_row; qrow += nblock*sizeof(block_iq4_nl); @@ -4708,6 +4944,8 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y GGML_ASSERT(k%QK4_NL == 0); int64_t nblock = k/QK4_NL; uint8_t L[QK4_NL]; + uint8_t Laux[QK4_NL]; + struct fraction Faux[QK4_NL * 16]; float weight[QK4_NL]; uint16_t unused_h; uint8_t * unused_l = NULL; @@ -4715,7 +4953,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y block_iq4_nl * iq4 = y; for (int ibl = 0; ibl < nblock; ++ibl) { quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, - &scale, weight, L, kvalues_iq4nl, NULL, -1); + &scale, weight, L, Laux, Faux, kvalues_iq4nl, NULL); } } @@ -4724,6 +4962,8 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; uint8_t L[QK_K]; + uint8_t Laux[32]; + struct fraction Faux[32 * 16]; float weight[32]; float scales[QK_K/32]; for (int64_t row = 0; row < nrow; ++row) { @@ -4731,7 +4971,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, - scales, weight, L, kvalues_iq4nl, qw, 7); + scales, weight, L, Laux, Faux, kvalues_iq4nl, qw); } src += n_per_row; qrow += nblock*sizeof(block_iq4_xs); From d0060fc498595c7a2b7c1e28662cc11f298e6c9a Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 21 Feb 2025 15:05:03 -0500 Subject: [PATCH 02/12] ggml-quants : better and faster make_qkxs_quants --- ggml/src/ggml-quants.c | 179 ++++++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 63 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 638766ae89bcf..29ffb608b645f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -660,97 +660,148 @@ static inline int compare_fractions_desc(const void * a, const void * b) { // exhaustive search with cumulative sums // Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions -static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, struct fraction * restrict Faux, bool signed_scale) { - float max = 0.0f; - float amax = 0.0f; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { - amax = ax; - max = x[i]; +static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) { + const int orig_nmin = nmin; + const int orig_nmax = nmax; + float max = x[0]; + float min = x[0]; + float w_amax = weights[0] * fabsf(x[0]); + int max_i = 0; + int w_amax_i = 0; + int min_i = 0; + for (int i = 1; i < n; ++i) { + if (x[i] < min) { min = x[i]; min_i = i; } + if (x[i] > max) { max = x[i]; max_i = i; } + // Find the most important value + const float w = weights[i]; + const float wax = w * fabsf(x[i]); + if (wax > w_amax) { + w_amax = wax; + w_amax_i = i; + } + } + const int amax_i = fabsf(min) > fabsf(max) ? min_i : max_i; + const float amax = fabsf(x[amax_i]); + + if (amax < GROUP_MAX_EPS) { // all zero + for (int i = 0; i < n; ++i) { + L[i] = 0; } + return 0.0f; } bool negative_scale = false; if (signed_scale && -nmin != nmax) { // the max side should have the biggest range - if ((max < 0.0f) == (-nmin < nmax)) { + // FIXME: this is incorrect when the weights[.] do not sort in the same order as fabsf(x[.]) + // or is it some other condition? + if ((x[amax_i] < 0.0f) == (-nmin < nmax)) { // [-4, 3] ==> [-3, 4] - int tmp = nmin; + const int tmp = nmin; + const float ftmp = min; nmin = -nmax; nmax = -tmp; + min = -max; + max = -ftmp; negative_scale = true; } } - if (amax < GROUP_MAX_EPS) { // all zero + + // Find the max range in [0, amax_range] which doesn't result in clamping. + // This is the range from the side which would clamp first (biggest ratio of max to nmax). + int amax_range; + float range_max; + if (fabsf(-max * nmin) < fabsf(-min * nmax)) { + amax_range = MAX(0, -nmin); + range_max = fabsf(min); + } else { + amax_range = MAX(0, nmax); + range_max = fabsf(max); + } + float sumlx = 0.0f; + float suml2 = 0.0f; + float scale = 0.0f; + float best = 0.0f; + float best_denom = 1.0f; + if (amax_range > 1) { + // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value. + // Proof: anything smaller has a representable vector with values twice as big. + const float iscale = ((float)(amax_range / 2 + 1))/range_max * (negative_scale ? -1.0f : 1.0f); + for (int i = 0; i < n; ++i) { + const float w = weights[i]; + int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); + if (negative_scale) { l = -l; } + Laux[i] = l; + L[i] = l; + suml2 += w * l * l; + sumlx += w * l * x[i]; + } + best = sumlx * sumlx; + best_denom = suml2; // should never be zero + scale = sumlx / suml2; + } else { for (int i = 0; i < n; ++i) { + Laux[i] = 0; L[i] = 0; } - return 0.0f; } + + const int imax_range = MAX(0, (x[w_amax_i] < 0.0f) ? -nmin : nmax); + const int max_odd = 2*(imax_range + 1) + 1; + const float wmax = fabsf(x[w_amax_i]); int n_frac = 0; for (int i = 0; i < n; ++i) { // assuming nmin <= nmax - const int odd_max = MAX(0, x[i] < 0 ? -nmin : nmax); - const int odd_min = MAX(0, x[i] < 0 ? -nmax : nmin); + const int odd_max = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmin : nmax); + const int odd_min = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmax : nmin); const float v = fabsf(x[i]); - // fprintf(stderr, "%s: i=%d, odd_min=%d, odd_max=%d\n", __func__, i, odd_min, odd_max); + const float v_max_odd = v * max_odd; for (int j = odd_min; j < odd_max; ++j) { const float odd = 2*j + 1; - Faux[n_frac++] = (struct fraction){ - .numer=v, - .denom=odd, - .i=i, - }; + if (wmax * odd < v_max_odd) { + Faux[n_frac++] = (struct fraction){ + .numer=v, + .denom=odd, + .i=i, + }; + } else { + // stop when the inverse scale would result in clamping the max (FIXME: most important) value + break; + } } } qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc); - float iscale = 0.0f; - { - float sumlx = 0.0f; - float suml2 = 0.0f; - float best = 0.0f; - float best_denom = 1.0f; - for (int i = 0; i < n_frac; ++i) { - // maximize the weighted cosine - const int ii = Faux[i].i; - const float w = weights ? weights[ii] : x[ii] * x[ii]; - sumlx += w * Faux[i].numer; - suml2 += w * Faux[i].denom; - const float current = sumlx * sumlx; - // fprintf(stderr, "%s: Faux[%d]=(%f/%f) * %f, square(sumlx)=%f, suml2=%f, k*cos2=%f\n", __func__, i, Faux[i].numer, Faux[i].denom, Faux[i].weight, current, suml2, current / suml2); - // use the last in case of equality - // FIXME: > or >= ?? Why does [0, 0, 1] rounds to [0, 0, 0] with >= ? - if (suml2 > 0.0f && current * best_denom > best * suml2) { - best = current; - best_denom = suml2; - iscale = Faux[i].numer > 0.0f ? Faux[i].denom / (2.0f * Faux[i].numer) : 0.0f; - if (!isfinite(iscale)) { - fprintf(stderr, "%s: iscale is not finite, %f/(2*%f)\n", __func__, Faux[i].denom, Faux[i].numer); + int best_p_i = -1; // consecutive with 0..n_frac + for (int i = 0; i < n_frac; ++i) { + // maximize the weighted cosine + const int ii = Faux[i].i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + sumlx += w * Faux[i].numer; + suml2 += w * Faux[i].denom; + const float current = sumlx * sumlx; + Laux[ii] += x[ii] < 0.0f ? -1 : 1; + if (suml2 > 0.0f && Faux[i].numer > 0.0f && current * best_denom > best * suml2) { + best = current; + best_denom = suml2; + scale = sumlx / suml2; + if (i == best_p_i + 1) { + // reduce copies for consecutive bests + L[ii] += x[ii] < 0.0f ? -1 : 1; + } else { + for (int j = 0; j < n; ++j) { + L[j] = Laux[j]; } } + best_p_i = i; } } - // (very) small fudging necessary because floats otherwise round to nearest even - iscale = iscale * ((float)((1 << 23) + 1) / (float)(1 << 23)); - - float sumlx = 0.0f; - float suml2 = 0.0f; for (int i = 0; i < n; ++i) { - // Rounding away from zero is assumed by the search algorithm above. - int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); - if (negative_scale) { - l = -l; - } - L[i] = negative_scale ? l + nmax : l - nmin; - float w = weights ? weights[i] : x[i] * x[i]; - // weighted projection scale - sumlx += w * x[i] * l; - suml2 += w * l * l; + L[i] = negative_scale ? (-L[i] + nmax) : (L[i] + -nmin); + GGML_ASSERT(L[i] >= 0 && L[i] <= nmax - nmin); } - return suml2 > 0.0f ? sumlx / suml2 : 0.0f; + return negative_scale ? -scale : scale; } // non-linear exhaustive search with cumulative sums @@ -1234,6 +1285,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in const int nb = k / QK_K; int8_t L[QK_K]; + int8_t Laux[16]; struct fraction Faux[16 * 4]; float scales[QK_K / 16]; float weights[16]; @@ -1247,7 +1299,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in float max_scale = 0; float amax = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Faux, true); + scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true); // scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true); float scale = fabsf(scales[j]); if (scale > amax) { @@ -1367,6 +1419,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri const int nb = n_per_row / QK_K; int8_t L[QK_K]; + int8_t Laux[16]; float scales[QK_K / 16]; float weight[16]; float sw[QK_K / 16]; @@ -1391,14 +1444,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri sw[j] = sumw; // scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight); - scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Faux, true); + scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true); } memset(y[i].scales, 0, 12); // float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw); - float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Faux, true); + float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true); for (int j = 0; j < QK_K/16; ++j) { int l = Ls[j]; if (j < 8) { @@ -4856,11 +4909,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block for (int j = 0; j < block_size; ++j) weight[j] = sqrtf(sigma2 + xb[j]*xb[j]); // for (int j = 0; j < block_size; ++j) weight[j] = 1; } - float amax = 0, max = 0; + float amax = 0; for (int j = 0; j < block_size; ++j) { float ax = fabsf(xb[j]); if (ax > amax) { - amax = ax; max = xb[j]; + amax = ax; } } if (amax < GROUP_MAX_EPS) { From 6f7fe74946f24c2e3f3d2b1d9345a6882523da51 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 21 Feb 2025 18:47:09 -0500 Subject: [PATCH 03/12] ggml-quants : improve imatrix behavior for TQ1_0, TQ2_0, Q4_0, Q5_0 --- ggml/src/ggml-quants.c | 142 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 132 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 29ffb608b645f..4dcca9953501c 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -661,8 +661,6 @@ static inline int compare_fractions_desc(const void * a, const void * b) { // exhaustive search with cumulative sums // Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) { - const int orig_nmin = nmin; - const int orig_nmax = nmax; float max = x[0]; float min = x[0]; float w_amax = weights[0] * fabsf(x[0]); @@ -2143,6 +2141,8 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri float weight[QK4_0]; int8_t L[QK4_0]; + int8_t Laux[QK4_0]; + struct fraction Faux[8 * QK4_0]; float sum_x2 = 0; for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; @@ -2153,7 +2153,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri const float * xb = x + QK4_0 * ib; const float * qw = quant_weights + QK4_0 * ib; for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); - float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight); + float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true); y[ib].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 16; ++j) { y[ib].qs[j] = L[j] | (L[j+16] << 4); @@ -2231,6 +2231,8 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri float weight[QK5_0]; int8_t L[QK5_0]; + int8_t Laux[QK5_0]; + struct fraction Faux[16 * QK5_0]; float sum_x2 = 0; for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; @@ -2241,7 +2243,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri const float * xb = x + QK5_0 * ib; const float * qw = quant_weights + QK5_0 * ib; for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); - float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight); + float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true); y[ib].d = GGML_FP32_TO_FP16(d); uint32_t qh = 0; @@ -2403,6 +2405,74 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, } } +static void quantize_row_tq1_0_impl(const float * restrict x, block_tq1_0 * restrict y, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + quantize_row_tq1_0_ref(x, y, n_per_row); + return; + } + + float weight[QK_K]; + int8_t L[QK_K]; + int8_t Laux[QK_K]; + struct fraction Faux[1 * QK_K]; + + float sum_x2 = 0; + for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; } + float sigma2 = sum_x2/n_per_row; + + const int64_t nb = n_per_row/QK_K; + for (int ib = 0; ib < nb; ++ib) { + const float * xb = x + QK_K * ib; + const float * qw = quant_weights + QK_K * ib; + const int8_t * Lptr = L; + for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } + float d = make_qkxs_quants(QK_K, -1, 1, xb, weight, L, Laux, Faux, false); + y[ib].d = GGML_FP32_TO_FP16(d); + + // 5 elements per byte, along 32 bytes + for (size_t j = 0; j < sizeof(y->qs) - sizeof(y->qs) % 32; j += 32) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 5; ++n) { + q *= 3; + q += Lptr[m + n*32]; + } + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[ib].qs[j + m] = q; + } + Lptr += 5*32; + } + // along 16 bytes + for (size_t j = sizeof(y->qs) - sizeof(y->qs) % 32; j < sizeof(y->qs); j += 16) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 5; ++n) { + q *= 3; + q += Lptr[m + n*16]; + } + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[ib].qs[j + m] = q; + } + Lptr += 5*16; + } + // 4 elements per byte + for (size_t j = 0; j < sizeof(y->qh); ++j) { + uint8_t q = 0; + for (size_t m = 0; m < 4; ++m) { + q *= 3; + q += Lptr[j + m*sizeof(y->qh)]; + } + // shift the first value to the most significant trit + q *= 3; + // ceiling division (243 == pow(3, 5)) + q = ((uint16_t)q * 256 + (243 - 1)) / 243; + y[ib].qh[j] = q; + } + } +} + void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2435,17 +2505,69 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, } } + +static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * restrict y, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + quantize_row_tq2_0_ref(x, y, n_per_row); + return; + } + + float weight[QK_K]; + int8_t L[QK_K]; + int8_t Laux[QK_K]; + struct fraction Faux[2 * QK_K]; + + float sum_x2 = 0; + for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; } + float sigma2 = sum_x2/n_per_row; + + const int64_t nb = n_per_row/QK_K; + for (int ib = 0; ib < nb; ++ib) { + const float * xb = x + QK_K * ib; + const float * qw = quant_weights + QK_K * ib; + for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } + float d = make_qkxs_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux, true); + y[ib].d = GGML_FP32_TO_FP16(d); + + for (size_t j = 0; j < sizeof(y->qs); j += 32) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = 0; + for (size_t n = 0; n < 4; ++n) { + q += (L[4*j + m + n*32] & 3) << (2*n); + } + y[ib].qs[j + m] = q; + } + } + } +} + size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - (void)quant_weights; // not used - const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); - quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row); + if (!quant_weights) { + quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); + } + size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_tq1_0_impl(src, (block_tq1_0*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } return nrow * row_size; } size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - (void)quant_weights; // not used - const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row); - quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row); + if (!quant_weights) { + quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_TQ2_0, n_per_row); + } + size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_tq2_0_impl(src, (block_tq2_0*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } return nrow * row_size; } From f27c1afc4040f3cb998b952cb571c280966b8ab2 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 7 Mar 2025 12:54:56 -0500 Subject: [PATCH 04/12] ggml-quants : improve TQ2_0 imatrix --- ggml/src/ggml-quants.c | 218 ++++++++++++++++++++++++++++++++++------- 1 file changed, 181 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 4dcca9953501c..f13805124ffb6 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -687,11 +687,11 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict } return 0.0f; } + bool negative_scale = false; if (signed_scale && -nmin != nmax) { // the max side should have the biggest range - // FIXME: this is incorrect when the weights[.] do not sort in the same order as fabsf(x[.]) - // or is it some other condition? + // FIXME: this is not always the best sign if ((x[amax_i] < 0.0f) == (-nmin < nmax)) { // [-4, 3] ==> [-3, 4] const int tmp = nmin; @@ -762,7 +762,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict .i=i, }; } else { - // stop when the inverse scale would result in clamping the max (FIXME: most important) value + // stop when the inverse scale would result in clamping the most important value break; } } @@ -802,6 +802,182 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict return negative_scale ? -scale : scale; } +// Very similar to make_qkxs_quants, but the sign of the scale is not assumed to be the sign of the absmax value. +static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux) { + // start at zero + nmin = MIN(0, nmin); + nmax = MAX(0, nmax); + float amax = 0.0f; + float min = 0.0f; + float max = 0.0f; + float w_amax = 0.0f; + int amax_i = -1; + int w_amax_i = -1; + for (int i = 0; i < n; ++i) { + const float w = weights ? weights[i] : x[i] * x[i]; + const float ax = fabsf(x[i]); + const float wax = w * ax; + if (ax > amax) { amax = ax; amax_i = i; } + if (x[i] > max) { max = x[i]; } + if (x[i] < min) { min = x[i]; } + // Find the most important value + if (wax > w_amax) { w_amax = wax; w_amax_i = i; } + } + + if (amax < GROUP_MAX_EPS || amax_i < 0 || w_amax_i < 0) { // all zero + for (int i = 0; i < n; ++i) { L[i] = 0; } + return 0.0f; + } + + // Use the side which will clamp first. + // The first clamped value is the absmax at the end of the common range. + // TODO: reduce the search space when one of the ranges is 0 + const int amax_range = MIN(-nmin, nmax); + float sumlx_p = 0.0f; + float suml2_p = 0.0f; + float sumlx_n = 0.0f; + float suml2_n = 0.0f; + float scale = 0.0f; + float best = 0.0f; + float best_denom = 1.0f; + int best_i = -2; // not consecutive with 0..n_frac + // Pre-calculate the half-point for the common range. + // All smaller vectors have a representable vector with twice the values, and thus can be skipped. + if (amax_range > 1) { + const float iscale = ((float)(amax_range / 2 + 1))/amax; + for (int i = 0; i < n; ++i) { + const float w = weights ? weights[i] : x[i] * x[i]; + int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); + Laux[i] = l; + suml2_p += w * l * l; + sumlx_p += w * l * x[i]; + } + sumlx_n = -sumlx_p; + suml2_n = suml2_p; + const float current_p = sumlx_p * sumlx_p; + if (suml2_p > 0.0f && current_p * best_denom > best * suml2_p) { + best = current_p; + best_denom = suml2_p; + scale = sumlx_p / suml2_p; + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_i = -1; // right before 0 of the loop after sorting + } + } else { + for (int i = 0; i < n; ++i) { + Laux[i] = 0; + } + } + + const int imax_range = MAX(nmax, -nmin); + const int max_odd = 2*(imax_range + 1) + 1; + const float wmax = fabsf(x[w_amax_i]); + int n_frac = 0; + for (int i = 0; i < n; ++i) { + // assuming nmin <= nmax + const int odd_max = MAX(nmax, -nmin); + const float v = fabsf(x[i]); + const float v_max_odd = v * max_odd; + for (int j = abs(Laux[i]); j < odd_max; ++j) { + const float odd = 2*j + 1; + const float wmax_odd = wmax * odd; + if (wmax_odd < v_max_odd) { + Faux[n_frac++] = (struct fraction){ + .numer=v, + .denom=odd, + .i=i, + }; + } else { + // stop when the inverse scale would result in clamping the most important value + break; + } + } + } + + qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc); + + const float max_common_odd = (MIN(nmax, -nmin) * 2) + 1; + const float max_odd_p = (nmax * 2) + 1; + const float max_odd_n = (-nmin * 2) + 1; + + for (int i = 0; i < n_frac; ++i) { + // maximize the weighted cosine similarity + const int ii = Faux[i].i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + const float lx = w * Faux[i].numer; + const float odd = Faux[i].denom; + const float l2 = w * odd; + + Laux[ii] += x[ii] < 0.0f ? -1 : 1; + + float sumlx = 0.0f; + float proj = 0.0f; + float norm = 0.0f; + if (odd < max_common_odd) { + sumlx_p += lx; + suml2_p += l2; + sumlx_n -= lx; + suml2_n += l2; + + sumlx = sumlx_p; + proj = sumlx_p * sumlx_p; + norm = suml2_p; + + // avoid double-copying Laux in a single iteration + if (suml2_p != suml2_n && suml2_p * suml2_n > 0.0f) { + const float proj_n = sumlx_n * sumlx_n; + if (proj_n * norm > proj * suml2_n) { + sumlx = sumlx_n; + proj = proj_n; + norm = suml2_n; + } + } + } else if (x[ii] < 0.0f ? odd < max_odd_n : odd < max_odd_p) { + sumlx_p += lx; + suml2_p += l2; + + sumlx = sumlx_p; + proj = sumlx_p * sumlx_p; + norm = suml2_p; + } else { + // outside the positive range means we're now into negatives + sumlx_n -= lx; + suml2_n += l2; + + sumlx = sumlx_n; + proj = sumlx_n * sumlx_n; + norm = suml2_n; + } + if (norm > 0.0f && proj * best_denom > best * norm) { + best = proj; + best_denom = norm; + scale = sumlx / norm; + if (i == best_i + 1) { + // reduce copies for consecutive bests + L[ii] += x[ii] < 0.0f ? -1 : 1; + } else { + for (int j = 0; j < n; ++j) { + L[j] = Laux[j]; + } + } + best_i = i; + } + } + + if (scale < 0.0f) { + for (int i = 0; i < n; ++i) { + L[i] = MAX(nmin, MIN(-L[i], nmax)) - nmin; + } + } else { + for (int i = 0; i < n; ++i) { + L[i] = MAX(nmin, MIN(L[i], nmax)) - nmin; + } + } + + return scale; +} + // non-linear exhaustive search with cumulative sums // Need Faux to have room for n*k fractions static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const float * restrict weights, const int8_t * restrict kvalues, uint8_t * restrict L, uint8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) { @@ -874,6 +1050,7 @@ static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const f } // Non-linear mappings are usually not symmetric, so try negating the scale + // This is the same as above, but keeping the old best if the new best is not better. if (signed_scale) { for (int i = 0; i < n; ++i) { Laux[i] = koff; @@ -1298,7 +1475,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in float amax = 0; for (int j = 0; j < QK_K/16; ++j) { scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true); - // scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true); float scale = fabsf(scales[j]); if (scale > amax) { amax = scale; max_scale = scales[j]; @@ -1324,21 +1500,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in y[i].d = GGML_FP32_TO_FP16(0.f); } - // int8_t sc; - // for (int j = 0; j < QK_K/16; ++j) { - // sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; - // sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; - // float d = GGML_FP16_TO_FP32(y[i].d) * sc; - // if (!d) { - // continue; - // } - // for (int ii = 0; ii < 16; ++ii) { - // int l = nearest_int(x[16*j + ii]/d); - // l = MAX(-4, MIN(3, l)); - // L[16*j + ii] = l + 4; - // } - // } - memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. int m = 0; @@ -1441,14 +1602,12 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri for (int l = 0; l < 16; ++l) sumw += weight[l]; sw[j] = sumw; - // scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight); scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true); } memset(y[i].scales, 0, 12); - // float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw); float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true); for (int j = 0; j < QK_K/16; ++j) { int l = Ls[j]; @@ -1462,21 +1621,6 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri } y[i].d = GGML_FP32_TO_FP16(d_block); - // int8_t sc; - // for (int j = 0; j < QK_K/16; ++j) { - // sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; - // sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; - // float d = GGML_FP16_TO_FP32(y[i].d) * sc; - // if (!d) { - // continue; - // } - // for (int ii = 0; ii < 16; ++ii) { - // int l = nearest_int(x[16*j + ii]/d); - // l = MAX(-4, MIN(3, l)); - // L[16*j + ii] = l + 4; - // } - // } - memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. int m = 0; @@ -2526,7 +2670,7 @@ static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * rest const float * xb = x + QK_K * ib; const float * qw = quant_weights + QK_K * ib; for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } - float d = make_qkxs_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux, true); + float d = make_qkxss_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux); y[ib].d = GGML_FP32_TO_FP16(d); for (size_t j = 0; j < sizeof(y->qs); j += 32) { From 0c9e44248996f3bcd08ca2c1acf0ef5548428f41 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 15 Mar 2025 10:29:47 -0400 Subject: [PATCH 05/12] ggml-quants : remove some commented code --- ggml/src/ggml-quants.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index f13805124ffb6..78c0e2f492a77 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5173,7 +5173,6 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } else { for (int j = 0; j < block_size; ++j) weight[j] = sqrtf(sigma2 + xb[j]*xb[j]); - // for (int j = 0; j < block_size; ++j) weight[j] = 1; } float amax = 0; for (int j = 0; j < block_size; ++j) { @@ -5258,7 +5257,6 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq4_nl); } -//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) { void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) { GGML_ASSERT(k%QK4_NL == 0); int64_t nblock = k/QK4_NL; From 30ad9c2873fd95ae0f4ecae4fabe0fb812dc640d Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 15 Mar 2025 12:55:22 -0400 Subject: [PATCH 06/12] ggml-quants : faster exhaustive IQ4_NL rounding with k_heap --- ggml/src/ggml-quants.c | 317 +++++++++++++++++++++++++++++------------ 1 file changed, 227 insertions(+), 90 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 78c0e2f492a77..6762e98f68097 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -658,6 +658,153 @@ static inline int compare_fractions_desc(const void * a, const void * b) { return (ab == ba) ? ((a > b) ? 1 : -1) : (ab < ba) ? 1 : -1; } +struct k_heap_cell { + float frac; + float x; + int x_i; + int k_i; +}; + +// Faster enumeration for cumulative search +struct k_heap { + int n; + int k; + int mid_k; + int8_t kmin; + int8_t kmax; + const float * odd; + const float * steps; + struct k_heap_cell * heap; +}; + +// build a max heap out of k_cells starting from node i +static void k_heap_build(struct k_heap * heap, int i) { + const int n = heap->n; + int max = i; + int prev_max; + while (max < n / 2) { + prev_max = max; + int left = 2*(max + 1) - 1; + int right = 2*(max + 1); + if (left < n && heap->heap[left].frac > heap->heap[max].frac) { + max = left; + } + if (right < n && heap->heap[right].frac > heap->heap[max].frac) { + max = right; + } + if (max != prev_max) { + struct k_heap_cell tmp = heap->heap[prev_max]; + heap->heap[prev_max] = heap->heap[max]; + heap->heap[max] = tmp; + } else { + break; + } + } +} + +// Assuming kvalues is sorted from most negative to most positive. +// odd and steps are buffers of size k, while heap_cells should be big enough for x later on. +static void k_heap_init(struct k_heap * restrict k_heap, int k, const int8_t * restrict kvalues, struct k_heap_cell * restrict heap_cells, float * restrict odd, float * restrict steps) { + GGML_ASSERT(k_heap && kvalues && heap_cells && odd); + k_heap->n = 0; + k_heap->k = k; + k_heap->odd = odd; + k_heap->steps = steps; + k_heap->heap = heap_cells; + + int amin = abs(kvalues[0]); + int amax = abs(kvalues[0]); + int mid_k = 0; + int max_k = 0; + for (int i = 1; i < k; ++i) { + const int ak = abs(kvalues[i]); + if (ak < amin) { + amin = ak; + mid_k = i; + } + if (ak > amax) { + amax = ak; + max_k = i; + } + } + k_heap->mid_k = mid_k; + k_heap->kmin = kvalues[mid_k]; + k_heap->kmax = kvalues[max_k]; + + for (int i = 0; i < k - 1; ++i) { + const float threshold = kvalues[i + 1] + kvalues[i]; + const float step = kvalues[i + 1] - kvalues[i]; + // It's amazing how their product is the difference between consecutive squares of the kvalues + GGML_ASSERT(threshold * step != 0.0f); + odd[i + (i >= mid_k ? 1 : 0)] = fabsf(threshold); + steps[i + (i >= mid_k ? 1 : 0)] = fabsf(step); + } + odd[mid_k] = 0.0f; + steps[mid_k] = 0.0f; + + GGML_ASSERT(mid_k > 0 && mid_k + 1 < k); +} + +// TODO: initial quantized values +static void k_heap_set_x(struct k_heap * k_heap, const float * restrict x, int n, bool invert_sign) { + // TODO: sanity checks + k_heap->n = n; + for (int i = 0; i < n; ++i) { + const int k_i = ((x[i] < 0.0f) != invert_sign) ? k_heap->mid_k - 1 : k_heap->mid_k + 1; + k_heap->heap[i] = (struct k_heap_cell) { + .k_i=k_i, + .x_i=i, + .x=fabsf(x[i]), + .frac=fabsf(x[i] / k_heap->odd[k_i]), + }; + } + + for (int i = (k_heap->n / 2) - 1; i >= 0; --i) { + k_heap_build(k_heap, i); + } +} + +static struct fraction k_heap_pop(struct k_heap * k_heap) { + if (k_heap && k_heap->n > 0) { + struct k_heap_cell * heap_cell = k_heap->heap; + const float step = k_heap->steps[heap_cell->k_i]; + // Properly turn this into a difference of consecutive squares + struct fraction frac = (struct fraction) { + .numer=heap_cell->x*step, + .denom=k_heap->odd[heap_cell->k_i]*step, + .i=heap_cell->x_i, + }; + + if (heap_cell->k_i < k_heap->mid_k) { + if (heap_cell->k_i > 0) { + heap_cell->k_i -= 1; + heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i]; + } else { + // remove this node + k_heap->heap[0] = k_heap->heap[k_heap->n - 1]; + k_heap->n -= 1; + } + } else { + if (heap_cell->k_i < k_heap->k - 1) { + heap_cell->k_i += 1; + heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i]; + } else { + // remove this node + k_heap->heap[0] = k_heap->heap[k_heap->n - 1]; + k_heap->n -= 1; + } + } + k_heap_build(k_heap, 0); + + return frac; + } + return (struct fraction) { + .numer=0.0f, + .denom=0.0f, + .i=-1, + }; +} + // exhaustive search with cumulative sums // Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) { @@ -979,111 +1126,89 @@ static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict } // non-linear exhaustive search with cumulative sums -// Need Faux to have room for n*k fractions -static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const float * restrict weights, const int8_t * restrict kvalues, uint8_t * restrict L, uint8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) { +static float make_qkxs_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) { float sumlx = 0.0f; float suml2 = 0.0f; - int kmin = abs(kvalues[0]); - int koff = 0; - for (int i = 1; i < k; ++i) { - int ak = abs(kvalues[i]); - if (ak < kmin) { - kmin = ak; - koff = i; - } - } - kmin = kvalues[koff]; + float amax = -1.0f; + int amax_i = -1; + const int8_t kmin = k_heap->kmin; for (int i = 0; i < n; ++i) { - float w = weights ? weights[i] : x[i] * x[i]; - Laux[i] = koff; + const float w = weights ? weights[i] : x[i] * x[i]; + const float ax = fabsf(x[i]); + if (ax > amax) { + amax = ax; + amax_i = i; + } + Laux[i] = k_heap->mid_k; sumlx += w * x[i] * kmin; suml2 += w * kmin * kmin; } - int n_frac_p = 0; - for (int i = 0; i < n; ++i) { - const int start = x[i] < 0.0f ? 1 : koff + 1; - const int end = x[i] < 0.0f ? koff + 1: k; - for (int j = start; j < end; ++j) { - const float threshold = kvalues[j] + kvalues[j - 1]; - const float step = kvalues[j] - kvalues[j - 1]; - Faux[n_frac_p++] = (struct fraction){ - // This should always be positive or else - // the fraction comparison function won't work properly - .numer=fabsf(x[i] * step), - // It's amazing how this is still the difference of consecutive squares - .denom=fabsf(threshold * step), - .i=i, - }; - } - } + const bool neg_scale = signed_scale && fast ? (x[amax_i] < 0.0f) != (k_heap->kmax < 0) : false; - qsort(Faux, n_frac_p, sizeof(struct fraction), compare_fractions_desc); + k_heap_set_x(k_heap, x, n, neg_scale); - float best = 0.0f; - float best_sumlx = 0.0f; - float best_suml2 = 1.0f; - float sumlx_p = sumlx; - float suml2_p = suml2; - int best_p_i = -2; // not consecutive with 0..n_frac - for (int i = 0; i < n_frac_p; ++i) { - const int ii = Faux[i].i; - const float w = weights ? weights[ii] : x[ii] * x[ii]; - sumlx_p += w * Faux[i].numer; - suml2_p += w * Faux[i].denom; - const float current = sumlx_p * sumlx_p; - Laux[ii] += x[ii] < 0.0f ? -1 : 1; - if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) { - best = current; - best_sumlx = sumlx_p; - best_suml2 = suml2_p; - if (i == best_p_i + 1) { - // reduce copies for consecutive bests - L[ii] += x[ii] < 0.0f ? -1 : 1; - } else { - for (int j = 0; j < n; ++j) { - L[j] = Laux[j]; + float best; + float best_sumlx; + float best_suml2; + if (suml2 != 0.0f) { + best = sumlx * sumlx; + best_sumlx = neg_scale ? -sumlx : sumlx; + best_suml2 = suml2 != 0.0f ? suml2 : 1.0f; + } else { + best = 0.0f; + best_sumlx = 0.0f; + best_suml2 = 1.0f; + } + { + float sumlx_p = neg_scale ? -sumlx : sumlx; + float suml2_p = suml2; + int best_p_i = -2; // not consecutive with 0..n_frac + int i = 0; + while (k_heap->n > 0) { + struct fraction frac = k_heap_pop(k_heap); + const int ii = frac.i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + sumlx_p += w * frac.numer; + suml2_p += w * frac.denom; + const float current = sumlx_p * sumlx_p; + Laux[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1; + if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) { + best = current; + best_sumlx = neg_scale ? -sumlx_p : sumlx_p; + best_suml2 = suml2_p; + if (i == best_p_i + 1) { + // reduce copies for consecutive bests + L[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1; + } else { + for (int j = 0; j < n; ++j) { + L[j] = Laux[j]; + } } + best_p_i = i; } - best_p_i = i; } } // Non-linear mappings are usually not symmetric, so try negating the scale // This is the same as above, but keeping the old best if the new best is not better. - if (signed_scale) { + if (signed_scale && !fast) { for (int i = 0; i < n; ++i) { - Laux[i] = koff; - } - - int n_frac_n = 0; - for (int i = 0; i < n; ++i) { - const int start = x[i] >= 0.0f ? 1 : koff + 1; - const int end = x[i] >= 0.0f ? koff + 1: k; - for (int j = start; j < end; ++j) { - const float threshold = kvalues[j] + kvalues[j - 1]; - const float step = kvalues[j] - kvalues[j - 1]; - Faux[n_frac_n++] = (struct fraction){ - // This should always be positive or else - // the fraction comparison function won't work properly - .numer=fabsf(x[i] * step), - // It's amazing how this is still the difference of consecutive squares - .denom=fabsf(threshold * step), - .i=i, - }; - } + Laux[i] = k_heap->mid_k; } - qsort(Faux, n_frac_n, sizeof(struct fraction), compare_fractions_desc); + k_heap_set_x(k_heap, x, n, true); float sumlx_n = -sumlx; float suml2_n = suml2; int best_n_i = -2; // not consecutive with 0..n_frac - for (int i = 0; i < n_frac_n; ++i) { - const int ii = Faux[i].i; + int i = 0; + while (k_heap->n > 0) { + struct fraction frac = k_heap_pop(k_heap); + const int ii = frac.i; const float w = weights ? weights[ii] : x[ii] * x[ii]; - sumlx_n += w * Faux[i].numer; - suml2_n += w * Faux[i].denom; + sumlx_n += w * frac.numer; + suml2_n += w * frac.denom; const float current = sumlx_n * sumlx_n; Laux[ii] += x[ii] >= 0.0f ? -1 : 1; if (suml2_n > 0.0f && current * best_suml2 > best * suml2_n) { @@ -1100,6 +1225,7 @@ static float make_qkxs_nl_quants(int n, int k, const float * restrict x, const f } best_n_i = i; } + ++i; } } @@ -5153,8 +5279,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) { static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x, ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, - float * scales, float * weight, uint8_t * L, uint8_t * Laux, struct fraction * Faux, - const int8_t * values, + float * scales, float * weight, uint8_t * L, uint8_t * Laux, struct k_heap * k_heap, const float * quant_weights) { float sigma2 = 0; @@ -5185,7 +5310,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block scales[ib] = 0; continue; } - float d = make_qkxs_nl_quants(block_size, 16, xb, weight, values, Lb, Laux, Faux, true); + float d = make_qkxs_nl_quants(block_size, xb, weight, Lb, Laux, k_heap, true, !quant_weights); scales[ib] = d; float abs_d = fabsf(d); if (abs_d > amax_scale) { @@ -5239,17 +5364,21 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t char * qrow = (char *)dst; uint8_t L[QK4_NL]; uint8_t Laux[QK4_NL]; - struct fraction Faux[QK4_NL * 16]; + struct k_heap_cell heap_cells[QK4_NL]; + struct k_heap k_heap; + float odd[16]; + float steps[16]; float weight[QK4_NL]; uint16_t unused_h; uint8_t * unused_l = NULL; float scale; + k_heap_init(&k_heap, 16, kvalues_iq4nl, heap_cells, odd, steps); for (int64_t row = 0; row < nrow; ++row) { block_iq4_nl * iq4 = (block_iq4_nl *)qrow; for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, - &scale, weight, L, Laux, Faux, kvalues_iq4nl, qw); + &scale, weight, L, Laux, &k_heap, qw); } src += n_per_row; qrow += nblock*sizeof(block_iq4_nl); @@ -5262,15 +5391,19 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y int64_t nblock = k/QK4_NL; uint8_t L[QK4_NL]; uint8_t Laux[QK4_NL]; - struct fraction Faux[QK4_NL * 16]; + struct k_heap_cell heap_cells[QK4_NL]; + struct k_heap k_heap; + float odd[16]; + float steps[16]; float weight[QK4_NL]; uint16_t unused_h; uint8_t * unused_l = NULL; float scale; block_iq4_nl * iq4 = y; + k_heap_init(&k_heap, 16, kvalues_iq4nl, heap_cells, odd, steps); for (int ibl = 0; ibl < nblock; ++ibl) { quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, - &scale, weight, L, Laux, Faux, kvalues_iq4nl, NULL); + &scale, weight, L, Laux, &k_heap, NULL); } } @@ -5280,15 +5413,19 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t char * qrow = (char *)dst; uint8_t L[QK_K]; uint8_t Laux[32]; - struct fraction Faux[32 * 16]; + struct k_heap_cell heap_cells[32]; + struct k_heap k_heap; + float odd[16]; + float steps[16]; float weight[32]; float scales[QK_K/32]; + k_heap_init(&k_heap, 16, kvalues_iq4nl, heap_cells, odd, steps); for (int64_t row = 0; row < nrow; ++row) { block_iq4_xs * iq4 = (block_iq4_xs *)qrow; for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, - scales, weight, L, Laux, Faux, kvalues_iq4nl, qw); + scales, weight, L, Laux, &k_heap, qw); } src += n_per_row; qrow += nblock*sizeof(block_iq4_xs); From 3be115100f581f418701362aa22c1c3a5d382b89 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 20 Mar 2025 19:21:45 -0400 Subject: [PATCH 07/12] ggml-quants : use a max-heap for linear quants like Q3_K Slightly faster than the previous method. --- ggml/src/ggml-quants.c | 253 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 224 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 6762e98f68097..1267e02dd3822 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -635,7 +635,7 @@ struct fraction { int i; }; -// comparator function for sorting fractions in make_qkxs_quants +// comparator function for sorting fractions static inline int compare_fractions_desc(const void * a, const void * b) { const struct fraction * f_a = (const struct fraction *) a; const struct fraction * f_b = (const struct fraction *) b; @@ -734,51 +734,106 @@ static void k_heap_init(struct k_heap * restrict k_heap, int k, const int8_t * r for (int i = 0; i < k - 1; ++i) { const float threshold = kvalues[i + 1] + kvalues[i]; const float step = kvalues[i + 1] - kvalues[i]; - // It's amazing how their product is the difference between consecutive squares of the kvalues + // It's amazing how their product is the difference between consecutive squares of the kvalues, + // but it makes sense because a*a - b*b == (a + b)*(a - b). GGML_ASSERT(threshold * step != 0.0f); odd[i + (i >= mid_k ? 1 : 0)] = fabsf(threshold); steps[i + (i >= mid_k ? 1 : 0)] = fabsf(step); } odd[mid_k] = 0.0f; steps[mid_k] = 0.0f; +} + +static void k_heap_init_linear(struct k_heap * k_heap, int nmin, int nmax, struct k_heap_cell * restrict heap_cells, float * restrict odd) { + GGML_ASSERT(k_heap && heap_cells && odd); + nmin = MIN(0, nmin); + nmax = MAX(0, nmax); + k_heap->n = 0; + k_heap->k = nmax - nmin + 1; + k_heap->odd = odd; + k_heap->steps = NULL; + k_heap->heap = heap_cells; + + k_heap->mid_k = -nmin; + k_heap->kmin = 0; // the range should always overlap 0 + k_heap->kmax = abs(nmin) > abs(nmax) ? nmin : nmax; + + for (int i = nmin; i < nmax; ++i) { + // odd numbers are the difference between consecutive squares + odd[i - nmin + (i >= 0 ? 1 : 0)] = fabsf((float) (i + (i + 1))); + } + odd[-nmin] = 0.0f; +} + +// with initial quantized values +static void k_heap_set_x_L(struct k_heap * k_heap, const float * restrict x, const int8_t * restrict L, int n, bool invert_sign) { + int j = 0; + for (int i = 0; i < n; ++i) { + const int k_i = ((x[i] < 0.0f) != invert_sign) ? L[i] - 1 : L[i] + 1; + GGML_ASSERT(k_i != k_heap->mid_k); + if (k_i >= 0 && k_i < k_heap->k) { + k_heap->heap[j++] = (struct k_heap_cell) { + .k_i=k_i, + .x_i=i, + .x=fabsf(x[i]), + .frac=fabsf(x[i] / k_heap->odd[k_i]), + }; + } + } + k_heap->n = j; - GGML_ASSERT(mid_k > 0 && mid_k + 1 < k); + for (int i = (k_heap->n / 2) - 1; i >= 0; --i) { + k_heap_build(k_heap, i); + } } -// TODO: initial quantized values +// assuming the initial quantized value are all at k_heap->mid_k static void k_heap_set_x(struct k_heap * k_heap, const float * restrict x, int n, bool invert_sign) { - // TODO: sanity checks - k_heap->n = n; + int j = 0; for (int i = 0; i < n; ++i) { const int k_i = ((x[i] < 0.0f) != invert_sign) ? k_heap->mid_k - 1 : k_heap->mid_k + 1; - k_heap->heap[i] = (struct k_heap_cell) { - .k_i=k_i, - .x_i=i, - .x=fabsf(x[i]), - .frac=fabsf(x[i] / k_heap->odd[k_i]), - }; + if (k_i >= 0 && k_i < k_heap->k) { + k_heap->heap[j++] = (struct k_heap_cell) { + .k_i=k_i, + .x_i=i, + .x=fabsf(x[i]), + .frac=fabsf(x[i] / k_heap->odd[k_i]), + }; + } } + k_heap->n = j; for (int i = (k_heap->n / 2) - 1; i >= 0; --i) { k_heap_build(k_heap, i); } } +// returns the fractions in descending order static struct fraction k_heap_pop(struct k_heap * k_heap) { if (k_heap && k_heap->n > 0) { struct k_heap_cell * heap_cell = k_heap->heap; - const float step = k_heap->steps[heap_cell->k_i]; - // Properly turn this into a difference of consecutive squares - struct fraction frac = (struct fraction) { - .numer=heap_cell->x*step, - .denom=k_heap->odd[heap_cell->k_i]*step, - .i=heap_cell->x_i, - }; + struct fraction frac; + if (k_heap->steps) { + const float step = k_heap->steps[heap_cell->k_i]; + // Properly turn this into a difference of consecutive squares even for non-linear steps + frac = (struct fraction) { + .numer=heap_cell->x * step, + .denom=k_heap->odd[heap_cell->k_i] * step, + .i=heap_cell->x_i, + }; + } else { + // step is always 1 for linear quants + frac = (struct fraction) { + .numer=heap_cell->x, + .denom=k_heap->odd[heap_cell->k_i], + .i=heap_cell->x_i, + }; + } if (heap_cell->k_i < k_heap->mid_k) { if (heap_cell->k_i > 0) { heap_cell->k_i -= 1; - heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i]; + heap_cell->frac = heap_cell->x / k_heap->odd[heap_cell->k_i]; } else { // remove this node k_heap->heap[0] = k_heap->heap[k_heap->n - 1]; @@ -787,7 +842,7 @@ static struct fraction k_heap_pop(struct k_heap * k_heap) { } else { if (heap_cell->k_i < k_heap->k - 1) { heap_cell->k_i += 1; - heap_cell->frac = heap_cell->x/k_heap->odd[heap_cell->k_i]; + heap_cell->frac = heap_cell->x / k_heap->odd[heap_cell->k_i]; } else { // remove this node k_heap->heap[0] = k_heap->heap[k_heap->n - 1]; @@ -838,7 +893,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict bool negative_scale = false; if (signed_scale && -nmin != nmax) { // the max side should have the biggest range - // FIXME: this is not always the best sign + // NOTE: this is not always the best sign if ((x[amax_i] < 0.0f) == (-nmin < nmax)) { // [-4, 3] ==> [-3, 4] const int tmp = nmin; @@ -870,7 +925,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict if (amax_range > 1) { // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value. // Proof: anything smaller has a representable vector with values twice as big. - const float iscale = ((float)(amax_range / 2 + 1))/range_max * (negative_scale ? -1.0f : 1.0f); + const float iscale = ((float)((amax_range >> 1) + 1))/range_max * (negative_scale ? -1.0f : 1.0f); for (int i = 0; i < n; ++i) { const float w = weights[i]; int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); @@ -949,6 +1004,129 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict return negative_scale ? -scale : scale; } +static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) { + const int nmin = -k_heap->mid_k; // TODO: maybe directly pass these + const int nmax = k_heap->k + nmin - 1; + float amax = fabsf(x[0]); + float w_amax = (weights ? weights[0] : x[0] * x[0]) * amax; + int amax_i = 0; + int w_amax_i = 0; + for (int i = 1; i < n; ++i) { + // Find the most important value + const float w = weights ? weights[i] : x[i] * x[i]; + const float ax = fabsf(x[i]); + const float wax = w * ax; + if (ax > amax) { + amax = ax; + amax_i = i; + } + if (wax > w_amax) { + w_amax = wax; + w_amax_i = i; + } + } + + if (amax < GROUP_MAX_EPS) { // all zero + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.0f; + } + + bool negative_scale = false; + if (signed_scale && -nmin > nmax) { + // the max side should have the biggest range + // NOTE: this is not always the best sign, but seems to be a good heuristic. + if ((x[amax_i] < 0.0f) == (-nmin < nmax)) { + // [-4, 3] ==> [-3, 4] + negative_scale = true; + } + } + + // Find the max range in [0, amax_range] which doesn't result in clamping. + // This is the range from the side which would clamp first (biggest ratio of max to nmax). + // But it's easier and safer to simply use the smallest range. + int amax_range = MIN(abs(nmin), abs(nmax)); + if (amax_range == 0) { + // one side will clamp anyway + amax_range = MAX(abs(nmin), abs(nmax)); + } + float sumlx = 0.0f; + float suml2 = 0.0f; + float scale = 0.0f; + float best = 0.0f; + float best_denom = 1.0f; // should never be zero + if (amax_range > 1) { + // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value. + // Proof: anything smaller has a representable vector with values twice as big. + // TODO: use a bigger iscale in asymmetric cases when possible + // NOTE: strangely, when using half+1, with nmin=-2 and nmax=2, the corners look slighlty clipped, + // but this does not happen when using half of the range as a starting point. + const float iscale = ((float)(amax_range >> 1))/amax * (negative_scale ? -1.0f : 1.0f); + for (int i = 0; i < n; ++i) { + const float w = weights ? weights[i] : x[i] * x[i]; + int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); + Laux[i] = l + k_heap->mid_k; + suml2 += w * l * l; + sumlx += w * l * x[i]; + } + if (suml2 > 0.0f) { + best = sumlx * sumlx; + best_denom = suml2; + scale = sumlx / suml2; + } + } else { + memset(Laux, k_heap->mid_k, n); + } + memcpy(L, Laux, n); + + k_heap_set_x_L(k_heap, x, Laux, n, negative_scale); + + const int imax_range = MAX(abs(nmin), abs(nmax)); + // const int imax_range = (x[w_amax_i] < 0.0f) != negative_scale ? abs(nmin) : abs(nmax); + const int max_odd = 2*(imax_range + 1) + 1; + const float wmax = fabsf(x[w_amax_i]); + // const float wmax = amax; + { + int best_p_i = -1; // consecutive with 0..n_frac + int i = 0; + while (k_heap->n > 0) { + struct fraction frac = k_heap_pop(k_heap); + if (frac.numer == 0.0f) { break; } + const float v_max_odd = frac.numer * max_odd; + if (wmax * frac.denom > v_max_odd) { + // stop when the inverse scale would result in clamping the most important value + break; + } + // maximize the weighted cosine similarity + const int ii = frac.i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + if (negative_scale) { + frac.numer = -frac.numer; + } + sumlx += w * frac.numer; + suml2 += w * frac.denom; + const float current = sumlx * sumlx; + Laux[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1; + if (suml2 > 0.0f && current * best_denom > best * suml2) { + best = current; + best_denom = suml2; + scale = sumlx / suml2; + if (i == best_p_i + 1) { + // reduce copies for consecutive bests + L[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1; + } else { + memcpy(L, Laux, n); + } + best_p_i = i; + } + i += 1; + } + } + + return scale; +} + // Very similar to make_qkxs_quants, but the sign of the scale is not assumed to be the sign of the absmax value. static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux) { // start at zero @@ -991,7 +1169,7 @@ static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict // Pre-calculate the half-point for the common range. // All smaller vectors have a representable vector with twice the values, and thus can be skipped. if (amax_range > 1) { - const float iscale = ((float)(amax_range / 2 + 1))/amax; + const float iscale = ((float)((amax_range >> 1) + 1))/amax; for (int i = 0; i < n; ++i) { const float w = weights ? weights[i] : x[i] * x[i]; int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); @@ -1587,10 +1765,15 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in int8_t L[QK_K]; int8_t Laux[16]; - struct fraction Faux[16 * 4]; + // struct fraction Faux[16 * 4]; + struct k_heap_cell heap_cells[16]; + float odd[8]; + struct k_heap k_heap; float scales[QK_K / 16]; float weights[16]; + k_heap_init_linear(&k_heap, -4, 3, heap_cells, odd); + for (int i = 0; i < 16; ++i) { weights[i] = 1.0f; } @@ -1600,7 +1783,8 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in float max_scale = 0; float amax = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true); + // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true); + scales[j] = make_qkxh_quants(16, x + 16*j, weights, L + 16*j, Laux, &k_heap, true); float scale = fabsf(scales[j]); if (scale > amax) { amax = scale; max_scale = scales[j]; @@ -1709,7 +1893,16 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri float weight[16]; float sw[QK_K / 16]; int8_t Ls[QK_K / 16]; - struct fraction Faux[16 * 32]; + // struct fraction Faux[16 * 32]; + struct k_heap_cell heap_cells[16]; + float odd[8]; + struct k_heap k_heap; + struct k_heap_cell heap_cells_s[QK_K / 16]; + float odd_s[64]; + struct k_heap k_heap_s; + + k_heap_init_linear(&k_heap, -4, 3, heap_cells, odd); + k_heap_init_linear(&k_heap_s, -32, 31, heap_cells_s, odd_s); for (int i = 0; i < nb; i++) { @@ -1728,13 +1921,15 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri for (int l = 0; l < 16; ++l) sumw += weight[l]; sw[j] = sumw; - scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true); + // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true); + scales[j] = make_qkxh_quants(16, x + 16*j, weight, L + 16*j, Laux, &k_heap, true); } memset(y[i].scales, 0, 12); - float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true); + // float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true); + float d_block = make_qkxh_quants(QK_K/16, scales, sw, Ls, Laux, &k_heap_s, true); for (int j = 0; j < QK_K/16; ++j) { int l = Ls[j]; if (j < 8) { From f86b8ff21084c86ab4722887f6b4f6a38f67cbf6 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 21 Mar 2025 14:05:58 -0400 Subject: [PATCH 08/12] ggml-quants : use qkxh in more places --- ggml/src/ggml-quants.c | 61 +++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 1267e02dd3822..5e25887f10fc6 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -629,7 +629,6 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } struct fraction { - // float frac; float numer; float denom; int i; @@ -677,7 +676,8 @@ struct k_heap { struct k_heap_cell * heap; }; -// build a max heap out of k_cells starting from node i +// build a max heap out of k_cells starting from node i; +// makes sure the node i is bigger than its children static void k_heap_build(struct k_heap * heap, int i) { const int n = heap->n; int max = i; @@ -744,6 +744,7 @@ static void k_heap_init(struct k_heap * restrict k_heap, int k, const int8_t * r steps[mid_k] = 0.0f; } +// for linear types which have a constant step of 1 between representable values static void k_heap_init_linear(struct k_heap * k_heap, int nmin, int nmax, struct k_heap_cell * restrict heap_cells, float * restrict odd) { GGML_ASSERT(k_heap && heap_cells && odd); nmin = MIN(0, nmin); @@ -1004,6 +1005,7 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict return negative_scale ? -scale : scale; } +// exhaustive search with cumulative sums static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) { const int nmin = -k_heap->mid_k; // TODO: maybe directly pass these const int nmax = k_heap->k + nmin - 1; @@ -1027,9 +1029,7 @@ static float make_qkxh_quants(int n, const float * restrict x, const float * res } if (amax < GROUP_MAX_EPS) { // all zero - for (int i = 0; i < n; ++i) { - L[i] = 0; - } + memset(L, 0, n); return 0.0f; } @@ -1304,7 +1304,7 @@ static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict } // non-linear exhaustive search with cumulative sums -static float make_qkxs_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) { +static float make_qkxh_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) { float sumlx = 0.0f; float suml2 = 0.0f; float amax = -1.0f; @@ -1687,11 +1687,18 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri uint8_t L[QK_K]; uint8_t Laux[16]; + int8_t Lsaux[16]; float mins[QK_K/16]; float scales[QK_K/16]; float sw[QK_K/16]; float weight[16]; - uint8_t Ls[QK_K/16], Lm[QK_K/16]; + int8_t Ls[QK_K/16], Lm[QK_K/16]; + + struct k_heap_cell heap_cells_s[QK_K/16]; + float odd_s[16]; + struct k_heap k_heap_s; + + k_heap_init_linear(&k_heap_s, 0, 15, heap_cells_s, odd_s); for (int i = 0; i < nb; i++) { memset(sw, 0, QK_K/16*sizeof(float)); @@ -1706,8 +1713,8 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } float dm, mm; - dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); - mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); + dm = make_qkxh_quants(QK_K/16, scales, sw, Ls, Lsaux, &k_heap_s, false); + mm = make_qkxh_quants(QK_K/16, mins, sw, Lm, Lsaux, &k_heap_s, false); y[i].d = GGML_FP32_TO_FP16(dm); y[i].dmin = GGML_FP32_TO_FP16(mm); @@ -2607,7 +2614,12 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri float weight[QK4_0]; int8_t L[QK4_0]; int8_t Laux[QK4_0]; - struct fraction Faux[8 * QK4_0]; + // struct fraction Faux[8 * QK4_0]; + struct k_heap_cell heap_cells[QK4_0]; + float odd[16]; + struct k_heap k_heap; + + k_heap_init_linear(&k_heap, -8, 7, heap_cells, odd); float sum_x2 = 0; for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; @@ -2618,7 +2630,8 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri const float * xb = x + QK4_0 * ib; const float * qw = quant_weights + QK4_0 * ib; for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); - float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true); + // float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true); + float d = make_qkxh_quants(QK4_0, xb, weight, L, Laux, &k_heap, true); y[ib].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 16; ++j) { y[ib].qs[j] = L[j] | (L[j+16] << 4); @@ -2697,7 +2710,12 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri float weight[QK5_0]; int8_t L[QK5_0]; int8_t Laux[QK5_0]; - struct fraction Faux[16 * QK5_0]; + // struct fraction Faux[16 * QK5_0]; + struct k_heap_cell heap_cells[QK5_0]; + float odd[32]; + struct k_heap k_heap; + + k_heap_init_linear(&k_heap, -16, 15, heap_cells, odd); float sum_x2 = 0; for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; @@ -2708,7 +2726,8 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri const float * xb = x + QK5_0 * ib; const float * qw = quant_weights + QK5_0 * ib; for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); - float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true); + float d = make_qkxh_quants(QK5_0, xb, weight, L, Laux, &k_heap, true); + // float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true); y[ib].d = GGML_FP32_TO_FP16(d); uint32_t qh = 0; @@ -5505,7 +5524,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block scales[ib] = 0; continue; } - float d = make_qkxs_nl_quants(block_size, xb, weight, Lb, Laux, k_heap, true, !quant_weights); + float d = make_qkxh_nl_quants(block_size, xb, weight, Lb, Laux, k_heap, true, !quant_weights); scales[ib] = d; float abs_d = fabsf(d); if (abs_d > amax_scale) { @@ -5516,19 +5535,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block if (super_block_size/block_size > 1) { int nb = super_block_size/block_size; memset(scales_h, 0, ((nb+7)/8)*sizeof(uint16_t)); + // TODO: use make_qkxh_quants float d = -max_scale/32; dh[0] = GGML_FP32_TO_FP16(d); float id = d ? 1/d : 0.f; for (int ib = 0; ib < super_block_size/block_size; ++ib) { int l = nearest_int(id*scales[ib]); l = MAX(-32, MIN(31, l)); - // float dl = d * l; - // float idl = dl ? 1/dl : 0.f; - // uint8_t * Lb = L + ib*block_size; - // const float * xb = x + ib*block_size; - // for (int j = 0; j < block_size; ++j) { - // Lb[j] = best_index_int8(16, values, idl*xb[j]); - // } l += 32; uint8_t l_l = l & 0xf; uint8_t l_h = l >> 4; @@ -5538,12 +5551,6 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } else { dh[0] = GGML_FP32_TO_FP16(scales[0]); - // if (ntry > 0) { - // float id = scales[0] ? 1/scales[0] : 0; - // for (int j = 0; j < super_block_size; ++j) { - // L[j] = best_index_int8(16, values, id*x[j]); - // } - // } } for (int i = 0; i < super_block_size/32; ++i) { From 3e4b675c9f0f955965ad1ea3a927a57eee582b6d Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 22 Mar 2025 12:03:26 -0400 Subject: [PATCH 09/12] ggml-quants : use a max-heap for TQ1_0 and TQ2_0 quantization --- ggml/src/ggml-quants.c | 238 ++++++++++++++++++++++++++++++++++------- 1 file changed, 199 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 5e25887f10fc6..9be87a671ce83 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1007,8 +1007,8 @@ static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict // exhaustive search with cumulative sums static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) { - const int nmin = -k_heap->mid_k; // TODO: maybe directly pass these - const int nmax = k_heap->k + nmin - 1; + const int nmin = MIN(0, -k_heap->mid_k); // TODO: maybe directly pass these + const int nmax = MAX(0, k_heap->k + nmin - 1); float amax = fabsf(x[0]); float w_amax = (weights ? weights[0] : x[0] * x[0]) * amax; int amax_i = 0; @@ -1046,10 +1046,10 @@ static float make_qkxh_quants(int n, const float * restrict x, const float * res // Find the max range in [0, amax_range] which doesn't result in clamping. // This is the range from the side which would clamp first (biggest ratio of max to nmax). // But it's easier and safer to simply use the smallest range. - int amax_range = MIN(abs(nmin), abs(nmax)); + int amax_range = MIN(-nmin, nmax); if (amax_range == 0) { // one side will clamp anyway - amax_range = MAX(abs(nmin), abs(nmax)); + amax_range = MAX(-nmin, nmax); } float sumlx = 0.0f; float suml2 = 0.0f; @@ -1087,40 +1087,192 @@ static float make_qkxh_quants(int n, const float * restrict x, const float * res const int max_odd = 2*(imax_range + 1) + 1; const float wmax = fabsf(x[w_amax_i]); // const float wmax = amax; - { - int best_p_i = -1; // consecutive with 0..n_frac - int i = 0; - while (k_heap->n > 0) { - struct fraction frac = k_heap_pop(k_heap); - if (frac.numer == 0.0f) { break; } - const float v_max_odd = frac.numer * max_odd; - if (wmax * frac.denom > v_max_odd) { - // stop when the inverse scale would result in clamping the most important value - break; + int best_p_i = -1; // consecutive with 0..n_frac + for (int i = 0; k_heap->n > 0; ++i) { + struct fraction frac = k_heap_pop(k_heap); + if (frac.numer == 0.0f) { break; } + const float v_max_odd = frac.numer * max_odd; + if (wmax * frac.denom > v_max_odd) { + // stop when the inverse scale would result in clamping the most important value + break; + } + // maximize the weighted cosine similarity + const int ii = frac.i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + if (negative_scale) { + frac.numer = -frac.numer; + } + sumlx += w * frac.numer; + suml2 += w * frac.denom; + const float current = sumlx * sumlx; + Laux[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1; + if (suml2 > 0.0f && current * best_denom > best * suml2) { + best = current; + best_denom = suml2; + scale = sumlx / suml2; + if (i == best_p_i + 1) { + // reduce copies for consecutive bests + L[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1; + } else { + memcpy(L, Laux, n); } - // maximize the weighted cosine similarity - const int ii = frac.i; - const float w = weights ? weights[ii] : x[ii] * x[ii]; - if (negative_scale) { - frac.numer = -frac.numer; - } - sumlx += w * frac.numer; - suml2 += w * frac.denom; - const float current = sumlx * sumlx; - Laux[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1; - if (suml2 > 0.0f && current * best_denom > best * suml2) { - best = current; - best_denom = suml2; - scale = sumlx / suml2; - if (i == best_p_i + 1) { - // reduce copies for consecutive bests - L[ii] += (x[ii] < 0.0f) != negative_scale ? -1 : 1; - } else { - memcpy(L, Laux, n); + best_p_i = i; + } + } + + return scale; +} + +// like make_qkxh_quants, but doesn't assume the sign of the scale is the sign of the absmax value +static float make_qkxsh_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap) { + nmin = MIN(0, nmin); + nmax = MAX(0, nmax); + // start at zero + float amax = fabsf(x[0]); + float min = x[0]; + float max = x[0]; + float w_amax = weights[0] * amax; + int w_amax_i = 0; + for (int i = 1; i < n; ++i) { + const float w = weights[i]; + const float ax = fabsf(x[i]); + const float wax = w * ax; + if (ax > amax) { amax = ax; } + if (x[i] > max) { max = x[i]; } + if (x[i] < min) { min = x[i]; } + // Find the most important value + if (wax > w_amax) { w_amax = wax; w_amax_i = i; } + } + + if (amax < GROUP_MAX_EPS) { // all zero + memset(L, 0, n); + return 0.0f; + } + + // Use the side which will clamp first. + // The first clamped value is the absmax at the end of the common range. + int amax_range = MIN(-nmin, nmax); + if (amax_range == 0) { + // One side will always clamp anyway + amax_range = MAX(-nmin, nmax); + } + float sumlx_p = 0.0f; + float suml2_p = 0.0f; + float sumlx_n = 0.0f; + float suml2_n = 0.0f; + float scale = 0.0f; + float best = 0.0f; + float best_denom = 1.0f; + int best_i = -1; // consecutive with 0..n_frac + // Pre-calculate the half-point for the common range. + // All smaller vectors have a representable vector with twice the values, and thus can be skipped. + if (amax_range > 1) { + const float iscale = ((float)((amax_range >> 1) + 1))/amax; + for (int i = 0; i < n; ++i) { + const float w = weights[i]; + int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); + Laux[i] = l + k_heap->mid_k; + suml2_p += w * l * l; + sumlx_p += w * l * x[i]; + } + sumlx_n = -sumlx_p; + suml2_n = suml2_p; + const float current_p = sumlx_p * sumlx_p; + if (suml2_p > 0.0f) { + best = current_p; + best_denom = suml2_p; + scale = sumlx_p / suml2_p; + best_i = -1; // right before 0 of the loop after sorting + } + } else { + memset(Laux, k_heap->mid_k, n); + } + memcpy(L, Laux, n); + + k_heap_set_x_L(k_heap, x, Laux, n, false); + + // TODO: make that range sign-aware to reduce the search space + const int imax_range = MAX(nmax, -nmin); + const int max_odd = 2*(imax_range + 1) + 1; + const float wmax = fabsf(x[w_amax_i]); + + const float max_common_odd = (MIN(nmax, -nmin) * 2) + 1; + const float max_odd_p = (nmax * 2) + 1; + const float max_odd_n = (-nmin * 2) + 1; + for (int i = 0; k_heap->n > 0; ++i) { + struct fraction frac = k_heap_pop(k_heap); + // maximize the weighted cosine similarity + const int ii = frac.i; + const float w = weights[ii]; + const float lx = w * frac.numer; + const float odd = frac.denom; + const float l2 = w * odd; + if (wmax * odd > frac.numer * max_odd) { + // stop when the inverse scale would result in clamping the most important value + break; + } + + Laux[ii] += x[ii] < 0.0f ? -1 : 1; + + float sumlx; + float proj; + float norm; + if (odd < max_common_odd) { + sumlx_p += lx; + suml2_p += l2; + sumlx_n -= lx; + suml2_n += l2; + + sumlx = sumlx_p; + proj = sumlx_p * sumlx_p; + norm = suml2_p; + + // avoid double-copying Laux in a single iteration + if (suml2_p != suml2_n && suml2_p * suml2_n > 0.0f) { + const float proj_n = sumlx_n * sumlx_n; + if (proj_n * norm > proj * suml2_n) { + sumlx = sumlx_n; + proj = proj_n; + norm = suml2_n; } - best_p_i = i; } - i += 1; + } else if (x[ii] < 0.0f ? odd < max_odd_n : odd < max_odd_p) { + sumlx_p += lx; + suml2_p += l2; + + sumlx = sumlx_p; + proj = sumlx_p * sumlx_p; + norm = suml2_p; + } else { + // outside the positive range means we're now into negatives + sumlx_n -= lx; + suml2_n += l2; + + sumlx = sumlx_n; + proj = sumlx_n * sumlx_n; + norm = suml2_n; + } + if (norm > 0.0f && proj * best_denom > best * norm) { + best = proj; + best_denom = norm; + scale = sumlx / norm; + if (i == best_i + 1) { + // reduce copies for consecutive bests + L[ii] += x[ii] < 0.0f ? -1 : 1; + } else { + memcpy(L, Laux, n); + } + best_i = i; + } + } + + if (scale < 0.0f) { + for (int i = 0; i < n; ++i) { + L[i] = MAX(nmin, MIN(-(L[i] - k_heap->mid_k), nmax)) - nmin; + } + } else { + for (int i = 0; i < n; ++i) { + L[i] = MAX(nmin, MIN(L[i] - k_heap->mid_k, nmax)) - nmin; } } @@ -2898,7 +3050,11 @@ static void quantize_row_tq1_0_impl(const float * restrict x, block_tq1_0 * rest float weight[QK_K]; int8_t L[QK_K]; int8_t Laux[QK_K]; - struct fraction Faux[1 * QK_K]; + struct k_heap_cell heap_cells[QK_K]; + float odd[3]; + struct k_heap k_heap; + + k_heap_init_linear(&k_heap, -1, 1, heap_cells, odd); float sum_x2 = 0; for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; } @@ -2910,7 +3066,7 @@ static void quantize_row_tq1_0_impl(const float * restrict x, block_tq1_0 * rest const float * qw = quant_weights + QK_K * ib; const int8_t * Lptr = L; for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } - float d = make_qkxs_quants(QK_K, -1, 1, xb, weight, L, Laux, Faux, false); + float d = make_qkxh_quants(QK_K, xb, weight, L, Laux, &k_heap, false); y[ib].d = GGML_FP32_TO_FP16(d); // 5 elements per byte, along 32 bytes @@ -2999,7 +3155,11 @@ static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * rest float weight[QK_K]; int8_t L[QK_K]; int8_t Laux[QK_K]; - struct fraction Faux[2 * QK_K]; + struct k_heap_cell heap_cells[QK_K]; + float odd[4 + 1]; + struct k_heap k_heap; + + k_heap_init_linear(&k_heap, -2, 2, heap_cells, odd); float sum_x2 = 0; for (int j = 0; j < n_per_row; ++j) { sum_x2 += x[j]*x[j]; } @@ -3010,7 +3170,7 @@ static void quantize_row_tq2_0_impl(const float * restrict x, block_tq2_0 * rest const float * xb = x + QK_K * ib; const float * qw = quant_weights + QK_K * ib; for (int j = 0; j < QK_K; ++j) { weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); } - float d = make_qkxss_quants(QK_K, -1, 2, xb, weight, L, Laux, Faux); + float d = make_qkxsh_quants(QK_K, -1, 2, xb, weight, L, Laux, &k_heap); y[ib].d = GGML_FP32_TO_FP16(d); for (size_t j = 0; j < sizeof(y->qs); j += 32) { From af23abd3cb1385142271af3318c26d6475e17c54 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 22 Mar 2025 12:07:28 -0400 Subject: [PATCH 10/12] ggml-quants : remove slower qsort-based cumulative search --- ggml/src/ggml-quants.c | 329 ----------------------------------------- 1 file changed, 329 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 9be87a671ce83..f2edfc16241b6 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -861,150 +861,6 @@ static struct fraction k_heap_pop(struct k_heap * k_heap) { }; } -// exhaustive search with cumulative sums -// Need Faux to have room for n*(max(abs(nmin), abs(nmax))) fractions -static float make_qkxs_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux, bool signed_scale) { - float max = x[0]; - float min = x[0]; - float w_amax = weights[0] * fabsf(x[0]); - int max_i = 0; - int w_amax_i = 0; - int min_i = 0; - for (int i = 1; i < n; ++i) { - if (x[i] < min) { min = x[i]; min_i = i; } - if (x[i] > max) { max = x[i]; max_i = i; } - // Find the most important value - const float w = weights[i]; - const float wax = w * fabsf(x[i]); - if (wax > w_amax) { - w_amax = wax; - w_amax_i = i; - } - } - const int amax_i = fabsf(min) > fabsf(max) ? min_i : max_i; - const float amax = fabsf(x[amax_i]); - - if (amax < GROUP_MAX_EPS) { // all zero - for (int i = 0; i < n; ++i) { - L[i] = 0; - } - return 0.0f; - } - - bool negative_scale = false; - if (signed_scale && -nmin != nmax) { - // the max side should have the biggest range - // NOTE: this is not always the best sign - if ((x[amax_i] < 0.0f) == (-nmin < nmax)) { - // [-4, 3] ==> [-3, 4] - const int tmp = nmin; - const float ftmp = min; - nmin = -nmax; - nmax = -tmp; - min = -max; - max = -ftmp; - negative_scale = true; - } - } - - // Find the max range in [0, amax_range] which doesn't result in clamping. - // This is the range from the side which would clamp first (biggest ratio of max to nmax). - int amax_range; - float range_max; - if (fabsf(-max * nmin) < fabsf(-min * nmax)) { - amax_range = MAX(0, -nmin); - range_max = fabsf(min); - } else { - amax_range = MAX(0, nmax); - range_max = fabsf(max); - } - float sumlx = 0.0f; - float suml2 = 0.0f; - float scale = 0.0f; - float best = 0.0f; - float best_denom = 1.0f; - if (amax_range > 1) { - // The smallest non-redundant iscale makes the first clamped value half+1 its max integer value. - // Proof: anything smaller has a representable vector with values twice as big. - const float iscale = ((float)((amax_range >> 1) + 1))/range_max * (negative_scale ? -1.0f : 1.0f); - for (int i = 0; i < n; ++i) { - const float w = weights[i]; - int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); - if (negative_scale) { l = -l; } - Laux[i] = l; - L[i] = l; - suml2 += w * l * l; - sumlx += w * l * x[i]; - } - best = sumlx * sumlx; - best_denom = suml2; // should never be zero - scale = sumlx / suml2; - } else { - for (int i = 0; i < n; ++i) { - Laux[i] = 0; - L[i] = 0; - } - } - - const int imax_range = MAX(0, (x[w_amax_i] < 0.0f) ? -nmin : nmax); - const int max_odd = 2*(imax_range + 1) + 1; - const float wmax = fabsf(x[w_amax_i]); - int n_frac = 0; - for (int i = 0; i < n; ++i) { - // assuming nmin <= nmax - const int odd_max = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmin : nmax); - const int odd_min = MAX(abs(Laux[i]), x[i] < 0.0f ? -nmax : nmin); - const float v = fabsf(x[i]); - const float v_max_odd = v * max_odd; - for (int j = odd_min; j < odd_max; ++j) { - const float odd = 2*j + 1; - if (wmax * odd < v_max_odd) { - Faux[n_frac++] = (struct fraction){ - .numer=v, - .denom=odd, - .i=i, - }; - } else { - // stop when the inverse scale would result in clamping the most important value - break; - } - } - } - - qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc); - - int best_p_i = -1; // consecutive with 0..n_frac - for (int i = 0; i < n_frac; ++i) { - // maximize the weighted cosine - const int ii = Faux[i].i; - const float w = weights ? weights[ii] : x[ii] * x[ii]; - sumlx += w * Faux[i].numer; - suml2 += w * Faux[i].denom; - const float current = sumlx * sumlx; - Laux[ii] += x[ii] < 0.0f ? -1 : 1; - if (suml2 > 0.0f && Faux[i].numer > 0.0f && current * best_denom > best * suml2) { - best = current; - best_denom = suml2; - scale = sumlx / suml2; - if (i == best_p_i + 1) { - // reduce copies for consecutive bests - L[ii] += x[ii] < 0.0f ? -1 : 1; - } else { - for (int j = 0; j < n; ++j) { - L[j] = Laux[j]; - } - } - best_p_i = i; - } - } - for (int i = 0; i < n; ++i) { - L[i] = negative_scale ? (-L[i] + nmax) : (L[i] + -nmin); - GGML_ASSERT(L[i] >= 0 && L[i] <= nmax - nmin); - } - - return negative_scale ? -scale : scale; -} - // exhaustive search with cumulative sums static float make_qkxh_quants(int n, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale) { const int nmin = MIN(0, -k_heap->mid_k); // TODO: maybe directly pass these @@ -1279,182 +1135,6 @@ static float make_qkxsh_quants(int n, int nmin, int nmax, const float * restrict return scale; } -// Very similar to make_qkxs_quants, but the sign of the scale is not assumed to be the sign of the absmax value. -static float make_qkxss_quants(int n, int nmin, int nmax, const float * restrict x, const float * restrict weights, int8_t * restrict L, int8_t * restrict Laux, struct fraction * restrict Faux) { - // start at zero - nmin = MIN(0, nmin); - nmax = MAX(0, nmax); - float amax = 0.0f; - float min = 0.0f; - float max = 0.0f; - float w_amax = 0.0f; - int amax_i = -1; - int w_amax_i = -1; - for (int i = 0; i < n; ++i) { - const float w = weights ? weights[i] : x[i] * x[i]; - const float ax = fabsf(x[i]); - const float wax = w * ax; - if (ax > amax) { amax = ax; amax_i = i; } - if (x[i] > max) { max = x[i]; } - if (x[i] < min) { min = x[i]; } - // Find the most important value - if (wax > w_amax) { w_amax = wax; w_amax_i = i; } - } - - if (amax < GROUP_MAX_EPS || amax_i < 0 || w_amax_i < 0) { // all zero - for (int i = 0; i < n; ++i) { L[i] = 0; } - return 0.0f; - } - - // Use the side which will clamp first. - // The first clamped value is the absmax at the end of the common range. - // TODO: reduce the search space when one of the ranges is 0 - const int amax_range = MIN(-nmin, nmax); - float sumlx_p = 0.0f; - float suml2_p = 0.0f; - float sumlx_n = 0.0f; - float suml2_n = 0.0f; - float scale = 0.0f; - float best = 0.0f; - float best_denom = 1.0f; - int best_i = -2; // not consecutive with 0..n_frac - // Pre-calculate the half-point for the common range. - // All smaller vectors have a representable vector with twice the values, and thus can be skipped. - if (amax_range > 1) { - const float iscale = ((float)((amax_range >> 1) + 1))/amax; - for (int i = 0; i < n; ++i) { - const float w = weights ? weights[i] : x[i] * x[i]; - int l = MAX(nmin, MIN(lroundf(x[i] * iscale), nmax)); - Laux[i] = l; - suml2_p += w * l * l; - sumlx_p += w * l * x[i]; - } - sumlx_n = -sumlx_p; - suml2_n = suml2_p; - const float current_p = sumlx_p * sumlx_p; - if (suml2_p > 0.0f && current_p * best_denom > best * suml2_p) { - best = current_p; - best_denom = suml2_p; - scale = sumlx_p / suml2_p; - for (int i = 0; i < n; ++i) { - L[i] = Laux[i]; - } - best_i = -1; // right before 0 of the loop after sorting - } - } else { - for (int i = 0; i < n; ++i) { - Laux[i] = 0; - } - } - - const int imax_range = MAX(nmax, -nmin); - const int max_odd = 2*(imax_range + 1) + 1; - const float wmax = fabsf(x[w_amax_i]); - int n_frac = 0; - for (int i = 0; i < n; ++i) { - // assuming nmin <= nmax - const int odd_max = MAX(nmax, -nmin); - const float v = fabsf(x[i]); - const float v_max_odd = v * max_odd; - for (int j = abs(Laux[i]); j < odd_max; ++j) { - const float odd = 2*j + 1; - const float wmax_odd = wmax * odd; - if (wmax_odd < v_max_odd) { - Faux[n_frac++] = (struct fraction){ - .numer=v, - .denom=odd, - .i=i, - }; - } else { - // stop when the inverse scale would result in clamping the most important value - break; - } - } - } - - qsort(Faux, n_frac, sizeof(struct fraction), compare_fractions_desc); - - const float max_common_odd = (MIN(nmax, -nmin) * 2) + 1; - const float max_odd_p = (nmax * 2) + 1; - const float max_odd_n = (-nmin * 2) + 1; - - for (int i = 0; i < n_frac; ++i) { - // maximize the weighted cosine similarity - const int ii = Faux[i].i; - const float w = weights ? weights[ii] : x[ii] * x[ii]; - const float lx = w * Faux[i].numer; - const float odd = Faux[i].denom; - const float l2 = w * odd; - - Laux[ii] += x[ii] < 0.0f ? -1 : 1; - - float sumlx = 0.0f; - float proj = 0.0f; - float norm = 0.0f; - if (odd < max_common_odd) { - sumlx_p += lx; - suml2_p += l2; - sumlx_n -= lx; - suml2_n += l2; - - sumlx = sumlx_p; - proj = sumlx_p * sumlx_p; - norm = suml2_p; - - // avoid double-copying Laux in a single iteration - if (suml2_p != suml2_n && suml2_p * suml2_n > 0.0f) { - const float proj_n = sumlx_n * sumlx_n; - if (proj_n * norm > proj * suml2_n) { - sumlx = sumlx_n; - proj = proj_n; - norm = suml2_n; - } - } - } else if (x[ii] < 0.0f ? odd < max_odd_n : odd < max_odd_p) { - sumlx_p += lx; - suml2_p += l2; - - sumlx = sumlx_p; - proj = sumlx_p * sumlx_p; - norm = suml2_p; - } else { - // outside the positive range means we're now into negatives - sumlx_n -= lx; - suml2_n += l2; - - sumlx = sumlx_n; - proj = sumlx_n * sumlx_n; - norm = suml2_n; - } - if (norm > 0.0f && proj * best_denom > best * norm) { - best = proj; - best_denom = norm; - scale = sumlx / norm; - if (i == best_i + 1) { - // reduce copies for consecutive bests - L[ii] += x[ii] < 0.0f ? -1 : 1; - } else { - for (int j = 0; j < n; ++j) { - L[j] = Laux[j]; - } - } - best_i = i; - } - } - - if (scale < 0.0f) { - for (int i = 0; i < n; ++i) { - L[i] = MAX(nmin, MIN(-L[i], nmax)) - nmin; - } - } else { - for (int i = 0; i < n; ++i) { - L[i] = MAX(nmin, MIN(L[i], nmax)) - nmin; - } - } - - return scale; -} - // non-linear exhaustive search with cumulative sums static float make_qkxh_nl_quants(int n, const float * restrict x, const float * restrict weights, uint8_t * restrict L, uint8_t * restrict Laux, struct k_heap * restrict k_heap, bool signed_scale, bool fast) { float sumlx = 0.0f; @@ -1924,7 +1604,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in int8_t L[QK_K]; int8_t Laux[16]; - // struct fraction Faux[16 * 4]; struct k_heap_cell heap_cells[16]; float odd[8]; struct k_heap k_heap; @@ -1942,7 +1621,6 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in float max_scale = 0; float amax = 0; for (int j = 0; j < QK_K/16; ++j) { - // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weights, L + 16*j, Laux, Faux, true); scales[j] = make_qkxh_quants(16, x + 16*j, weights, L + 16*j, Laux, &k_heap, true); float scale = fabsf(scales[j]); if (scale > amax) { @@ -2052,7 +1730,6 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri float weight[16]; float sw[QK_K / 16]; int8_t Ls[QK_K / 16]; - // struct fraction Faux[16 * 32]; struct k_heap_cell heap_cells[16]; float odd[8]; struct k_heap k_heap; @@ -2080,14 +1757,12 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri for (int l = 0; l < 16; ++l) sumw += weight[l]; sw[j] = sumw; - // scales[j] = make_qkxs_quants(16, -4, 3, x + 16*j, weight, L + 16*j, Laux, Faux, true); scales[j] = make_qkxh_quants(16, x + 16*j, weight, L + 16*j, Laux, &k_heap, true); } memset(y[i].scales, 0, 12); - // float d_block = make_qkxs_quants(QK_K/16, -32, 31, scales, sw, Ls, Laux, Faux, true); float d_block = make_qkxh_quants(QK_K/16, scales, sw, Ls, Laux, &k_heap_s, true); for (int j = 0; j < QK_K/16; ++j) { int l = Ls[j]; @@ -2766,7 +2441,6 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri float weight[QK4_0]; int8_t L[QK4_0]; int8_t Laux[QK4_0]; - // struct fraction Faux[8 * QK4_0]; struct k_heap_cell heap_cells[QK4_0]; float odd[16]; struct k_heap k_heap; @@ -2782,7 +2456,6 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri const float * xb = x + QK4_0 * ib; const float * qw = quant_weights + QK4_0 * ib; for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); - // float d = make_qkxs_quants(QK4_0, -8, 7, xb, weight, L, Laux, Faux, true); float d = make_qkxh_quants(QK4_0, xb, weight, L, Laux, &k_heap, true); y[ib].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 16; ++j) { @@ -2862,7 +2535,6 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri float weight[QK5_0]; int8_t L[QK5_0]; int8_t Laux[QK5_0]; - // struct fraction Faux[16 * QK5_0]; struct k_heap_cell heap_cells[QK5_0]; float odd[32]; struct k_heap k_heap; @@ -2879,7 +2551,6 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri const float * qw = quant_weights + QK5_0 * ib; for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); float d = make_qkxh_quants(QK5_0, xb, weight, L, Laux, &k_heap, true); - // float d = make_qkxs_quants(QK5_0, -16, 15, xb, weight, L, Laux, Faux, true); y[ib].d = GGML_FP32_TO_FP16(d); uint32_t qh = 0; From 8b8b88f3de204486eddeaabc64015384b921b09f Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 22 Mar 2025 18:47:56 -0400 Subject: [PATCH 11/12] ggml-quants : restore Q2_K use of make_qp_quants Weirdly, it seems like in practice replacing this instance is not better. This is probably because of its interaction with make_qkx3_quants. --- ggml/src/ggml-quants.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 8e52f99c6070c..3b08b7e6a8c4e 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1519,18 +1519,11 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G uint8_t L[QK_K]; uint8_t Laux[16]; - int8_t Lsaux[16]; float mins[QK_K/16]; float scales[QK_K/16]; float sw[QK_K/16]; float weight[16]; - int8_t Ls[QK_K/16], Lm[QK_K/16]; - - struct k_heap_cell heap_cells_s[QK_K/16]; - float odd_s[16]; - struct k_heap k_heap_s; - - k_heap_init_linear(&k_heap_s, 0, 15, heap_cells_s, odd_s); + uint8_t Ls[QK_K/16], Lm[QK_K/16]; for (int i = 0; i < nb; i++) { memset(sw, 0, QK_K/16*sizeof(float)); @@ -1545,8 +1538,8 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G } float dm, mm; - dm = make_qkxh_quants(QK_K/16, scales, sw, Ls, Lsaux, &k_heap_s, false); - mm = make_qkxh_quants(QK_K/16, mins, sw, Lm, Lsaux, &k_heap_s, false); + dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); + mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); y[i].d = GGML_FP32_TO_FP16(dm); y[i].dmin = GGML_FP32_TO_FP16(mm); From a5b1943912c5657482b1921ef8385c72b9609b5d Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 23 Mar 2025 17:59:37 -0400 Subject: [PATCH 12/12] ggml-quants : fix some edge cases in make_qkxh_nl_quants --- ggml/src/ggml-quants.c | 66 ++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 3b08b7e6a8c4e..728e77fe87cd0 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1149,10 +1149,11 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo amax = ax; amax_i = i; } - Laux[i] = k_heap->mid_k; sumlx += w * x[i] * kmin; suml2 += w * kmin * kmin; } + memset(Laux, k_heap->mid_k, n); + memset(L, k_heap->mid_k, n); const bool neg_scale = signed_scale && fast ? (x[amax_i] < 0.0f) != (k_heap->kmax < 0) : false; @@ -1163,57 +1164,49 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo float best_suml2; if (suml2 != 0.0f) { best = sumlx * sumlx; - best_sumlx = neg_scale ? -sumlx : sumlx; - best_suml2 = suml2 != 0.0f ? suml2 : 1.0f; + best_sumlx = sumlx; // can't change the sign of kmin + best_suml2 = suml2; } else { best = 0.0f; best_sumlx = 0.0f; best_suml2 = 1.0f; } - { - float sumlx_p = neg_scale ? -sumlx : sumlx; - float suml2_p = suml2; - int best_p_i = -2; // not consecutive with 0..n_frac - int i = 0; - while (k_heap->n > 0) { - struct fraction frac = k_heap_pop(k_heap); - const int ii = frac.i; - const float w = weights ? weights[ii] : x[ii] * x[ii]; - sumlx_p += w * frac.numer; - suml2_p += w * frac.denom; - const float current = sumlx_p * sumlx_p; - Laux[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1; - if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) { - best = current; - best_sumlx = neg_scale ? -sumlx_p : sumlx_p; - best_suml2 = suml2_p; - if (i == best_p_i + 1) { - // reduce copies for consecutive bests - L[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1; - } else { - for (int j = 0; j < n; ++j) { - L[j] = Laux[j]; - } - } - best_p_i = i; + float sumlx_p = neg_scale ? -sumlx : sumlx; + float suml2_p = suml2; + int best_p_i = -1; // consecutive with 0..n_frac + for (int i = 0; k_heap->n > 0; ++i) { + struct fraction frac = k_heap_pop(k_heap); + const int ii = frac.i; + const float w = weights ? weights[ii] : x[ii] * x[ii]; + sumlx_p += w * frac.numer; + suml2_p += w * frac.denom; + const float current = sumlx_p * sumlx_p; + Laux[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1; + if (suml2_p > 0.0f && current * best_suml2 > best * suml2_p) { + best = current; + best_sumlx = neg_scale ? -sumlx_p : sumlx_p; + best_suml2 = suml2_p; + if (i == best_p_i + 1) { + // reduce copies for consecutive bests + L[ii] += (x[ii] < 0.0f) != neg_scale ? -1 : 1; + } else { + memcpy(L, Laux, n); } + best_p_i = i; } } // Non-linear mappings are usually not symmetric, so try negating the scale // This is the same as above, but keeping the old best if the new best is not better. if (signed_scale && !fast) { - for (int i = 0; i < n; ++i) { - Laux[i] = k_heap->mid_k; - } + memset(Laux, k_heap->mid_k, n); k_heap_set_x(k_heap, x, n, true); float sumlx_n = -sumlx; float suml2_n = suml2; int best_n_i = -2; // not consecutive with 0..n_frac - int i = 0; - while (k_heap->n > 0) { + for (int i = 0; k_heap->n > 0; ++i) { struct fraction frac = k_heap_pop(k_heap); const int ii = frac.i; const float w = weights ? weights[ii] : x[ii] * x[ii]; @@ -1229,13 +1222,10 @@ static float make_qkxh_nl_quants(int n, const float * GGML_RESTRICT x, const flo // reduce copies for consecutive bests L[ii] += x[ii] >= 0.0f ? -1 : 1; } else { - for (int j = 0; j < n; ++j) { - L[j] = Laux[j]; - } + memcpy(L, Laux, n); } best_n_i = i; } - ++i; } }