Skip to content

Vulkan IQ4_NL Support #8613

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
322 changes: 140 additions & 182 deletions ggml/src/ggml-vulkan.cpp

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions ggml/src/vulkan-shaders/dequant_funcs.comp
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
}
#endif

#if defined(DATA_A_IQ4_NL)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
}
#endif
30 changes: 30 additions & 0 deletions ggml/src/vulkan-shaders/dequant_iq4_nl.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#version 450

#include "dequant_head.comp"

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;

const uint tid = gl_LocalInvocationID.x % 64;
const uint il = tid/32;
const uint ir = tid%32;
const uint ib = 32*i + ir;
if (ib >= p.nel / 32) {
return;
}

const uint q_idx = 8*il;
const uint b_idx = 1024*i + 32*ir + q_idx;

const float d = float(data_a[ib].d);

[[unroll]] for (uint l = 0; l < 8; ++l) {
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
}
}
10 changes: 4 additions & 6 deletions ggml/src/vulkan-shaders/dequant_q4_0.comp
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,13 @@ void main() {
return;
}

const uint b_idx = 1024*i + 32*ir + 8*il;
const uint q_idx = 8*il;
const uint b_idx = 1024*i + 32*ir + q_idx;

const float d = float(data_a[ib].d);
const float dm = -8.0f * d;

const uint q_idx = 8*il;

[[unroll]] for (uint l = 0; l < 8; ++l) {
data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + dm);
data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f));
}
}
15 changes: 14 additions & 1 deletion ggml/src/vulkan-shaders/mul_mm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
shared FLOAT_TYPE buf_b[BN * (BK+1)];

#ifdef MUL_MAT_ID
shared u16vec2 row_ids[2048];
shared u16vec2 row_ids[3072];
#endif

void main() {
Expand Down Expand Up @@ -380,6 +380,19 @@ void main() {

buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
#elif defined(DATA_A_IQ4_NL)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;

const uint ib = idx / 16;
const uint iqs = idx & 0xF;

const float d = float(data_a[ib].d);
const uint vui = uint(data_a[ib].qs[iqs]);
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;

buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
#endif
}
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
Expand Down
21 changes: 21 additions & 0 deletions ggml/src/vulkan-shaders/types.comp
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,24 @@ struct block_q6_K

#define A_TYPE block_q6_K
#endif

// IQuants

#if defined(DATA_A_IQ4_NL)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 32
#define QUANT_R 2

struct block_iq4_nl
{
float16_t d;
uint8_t qs[QUANT_K/2];
};

#define A_TYPE block_iq4_nl

const int8_t kvalues_iq4nl[16] = {
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
};
#endif
3 changes: 2 additions & 1 deletion ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ const std::vector<std::string> type_names = {
"q3_k",
"q4_k",
"q5_k",
"q6_k"
"q6_k",
"iq4_nl"
};

void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
Expand Down
Loading