From 2d61ed7fc6c56de61fd347624b1fd40496ac1b4f Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Date: Tue, 16 May 2023 17:19:05 +0700 Subject: [PATCH 1/4] ~7% faster Q5_1 AVX2 code X is always positive (range 0..31) in ggml_vec_dot_q5_1_q8_1(), so there's no need for two _mm256_sign_epi8(). --- ggml.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index 4311ce7cf9dbe..fad137ad67ce8 100644 --- a/ggml.c +++ b/ggml.c @@ -543,12 +543,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) { return _mm256_cvtepi32_ps(summed_pairs); } -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(x, x); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(y, x); +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { #if __AVXVNNI__ const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); @@ -560,6 +555,15 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { #endif } +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +} + static inline __m128i packNibbles( __m256i bytes ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh @@ -2906,7 +2910,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const __m256 dy = _mm256_broadcast_ss(&y[i].d); const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_i8_pairs_float(bx, by); + const __m256 q = mul_sum_us8_pairs_float(bx, by); acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); } @@ -2940,7 +2944,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const __m256 dy = _mm256_broadcast_ss(&y[i].d); const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256 q = mul_sum_i8_pairs_float(bx, by); + const __m256 q = mul_sum_us8_pairs_float(bx, by); acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); } From c68faa991bbf02360d6db9c9d701641b830fb978 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Date: Tue, 16 May 2023 17:30:23 +0700 Subject: [PATCH 2/4] fixed AVX code --- ggml.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ggml.c b/ggml.c index fad137ad67ce8..09299914e56fe 100644 --- a/ggml.c +++ b/ggml.c @@ -623,6 +623,17 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { return _mm256_cvtepi32_ps(summed_pairs); } +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + // multiply int8_t, add results pairwise twice and return as float vector static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { const __m128i xl = _mm256_castsi256_si128(x); From c1b3a8e24dbba0b65aa52d17b2ec7d6759b27ba4 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Date: Tue, 16 May 2023 21:17:49 +0700 Subject: [PATCH 3/4] also applied to ggml_vec_dot_q4_1_q8_1 --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 09299914e56fe..baa0005a1c95d 100644 --- a/ggml.c +++ b/ggml.c @@ -2449,7 +2449,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); - const __m256 xy = mul_sum_i8_pairs_float(bx, by); + const __m256 xy = mul_sum_us8_pairs_float(bx, by); // Accumulate d0*d1*x*y #if defined(__AVX2__) From ff2638412d8f3065d3e398743ce378fb4f930492 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Date: Tue, 16 May 2023 21:39:06 +0700 Subject: [PATCH 4/4] make use of upcoming AVXVNNIINT8 --- ggml.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml.c b/ggml.c index baa0005a1c95d..dbef99312a56d 100644 --- a/ggml.c +++ b/ggml.c @@ -557,11 +557,17 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) // multiply int8_t, add results pairwise twice and return as float vector static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else // Get absolute values of x vectors const __m256i ax = _mm256_sign_epi8(x, x); // Sign the values of the y vectors const __m256i sy = _mm256_sign_epi8(y, x); return mul_sum_us8_pairs_float(ax, sy); +#endif } static inline __m128i packNibbles( __m256i bytes )