
ggml: adds CONV_2D op and direct GEMM Vulkan implementation #14316


Open · wants to merge 1 commit into master
2 changes: 2 additions & 0 deletions ggml/include/ggml-backend.h
@@ -340,6 +340,8 @@ extern "C" {

// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Compare the output of two backends; the graphs may differ, and only the selected nodes are compared
GGML_API bool ggml_backend_compare_graph_backend_node(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph1, struct ggml_cgraph * graph2, ggml_backend_eval_callback callback, void * user_data, char* op_name_out_1, char* op_name_out_2);

// Tensor initialization
GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
12 changes: 12 additions & 0 deletions ggml/include/ggml.h
@@ -481,6 +481,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
@@ -1663,6 +1664,17 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1

GGML_API struct ggml_tensor * ggml_conv_2d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int stride0, // stride dimension 0
int stride1, // stride dimension 1
int padding0, // padding dimension 0
int padding1, // padding dimension 1
int dilation0, // dilation dimension 0
int dilation1); // dilation dimension 1
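A quick usage sketch for the new entry point (shapes and values here are illustrative, and `ctx` is assumed to be an existing `ggml_context`):

```c
// 3x3 kernel, Cin = 64 -> Cout = 128, one 224x224 input image
struct ggml_tensor * knl = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 64, 128);   // [KW, KH, Cin, Cout]
struct ggml_tensor * inp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 224, 224, 64, 1); // [W, H, Cin, N]
// stride 1, padding 1, dilation 1 -> result is [224, 224, 128, 1] = [OW, OH, Cout, N]
struct ggml_tensor * res = ggml_conv_2d_direct(ctx, knl, inp, 1, 1, 1, 1, 1, 1);
```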

// kernel size is a->ne[0] x a->ne[1]
// stride is equal to kernel size
// padding is zero
49 changes: 49 additions & 0 deletions ggml/src/ggml-backend.cpp
@@ -1864,6 +1864,55 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
return true;
}

bool ggml_backend_compare_graph_backend_node(
ggml_backend_t backend1,
ggml_backend_t backend2,
struct ggml_cgraph * graph1,
struct ggml_cgraph * graph2,
ggml_backend_eval_callback callback,
void * user_data,
char * op_name_out_1,
char * op_name_out_2) {

ggml_tensor * out1 = NULL;
ggml_tensor * out2 = NULL;

struct ggml_cgraph * g1 = graph1;
struct ggml_cgraph * g2 = graph2;

for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
ggml_backend_graph_compute(backend1, &g1v);
if (ggml_is_view_op(t1->op)) {
continue;
}
if (strcmp(t1->name, op_name_out_1) == 0) {
out1 = t1;
}
}

for (int i = 0; i < g2->n_nodes; i++) {
struct ggml_tensor * t2 = g2->nodes[i];
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
ggml_backend_graph_compute(backend2, &g2v);
if (ggml_is_view_op(t2->op)) {
continue;
}
if (strcmp(t2->name, op_name_out_2) == 0) {
out2 = t2;
}
}

assert(out1 != NULL);
assert(out2 != NULL);
assert(ggml_are_same_layout(out1, out2));

// compare results, calculate rms etc
if (!callback(0, out1, out2, user_data)) {
return false;
}

return true;
}
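A minimal sketch of how this helper might be driven from a test; the backend variables, graph arguments, tensor names, and callback body are assumptions for illustration, not code from this PR:

```c
static bool check_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) node_index; (void) user_data;
    // compare t1 against t2 here (e.g. an RMSE over the output elements);
    // return false to report a mismatch
    return true;
}

static bool check_conv2d(ggml_backend_t backend_vk, ggml_backend_t backend_cpu,
                         struct ggml_cgraph * graph_direct, struct ggml_cgraph * graph_ref) {
    // the output tensors must have been named beforehand with ggml_set_name()
    return ggml_backend_compare_graph_backend_node(
        backend_vk, backend_cpu, graph_direct, graph_ref,
        check_cb, /*user_data=*/NULL,
        (char *) "conv2d_direct_out", (char *) "conv2d_ref_out");
}
```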

// CPU backend - buffer

static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
4 changes: 4 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -1858,6 +1858,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_CONV_2D:
{
GGML_ABORT("Op not supported on CPU yet.");
Review comment (Collaborator): I think a CPU implementation is generally required.

} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
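For reference on that review point, a naive direct CPU kernel for this op could look like the sketch below, assuming contiguous f32 tensors in the layouts used by the Vulkan path ([KW, KH, Cin, Cout] kernel, [W, H, Cin, N] input, [OW, OH, Cout, N] output). This is an illustration of the missing piece, not code from the PR:

```c
#include <stdint.h>

static void conv_2d_f32_naive(float * dst, const float * knl, const float * inp,
                              int64_t W, int64_t H, int64_t Cin, int64_t N,
                              int64_t KW, int64_t KH, int64_t Cout,
                              int64_t OW, int64_t OH,
                              int s0, int s1, int p0, int p1, int d0, int d1) {
    for (int64_t n = 0; n < N; n++) {
        for (int64_t co = 0; co < Cout; co++) {
            for (int64_t oy = 0; oy < OH; oy++) {
                for (int64_t ox = 0; ox < OW; ox++) {
                    float acc = 0.0f;
                    for (int64_t ci = 0; ci < Cin; ci++) {
                        for (int64_t ky = 0; ky < KH; ky++) {
                            for (int64_t kx = 0; kx < KW; kx++) {
                                const int64_t ix = ox*s0 + kx*d0 - p0;
                                const int64_t iy = oy*s1 + ky*d1 - p1;
                                if (ix < 0 || ix >= W || iy < 0 || iy >= H) {
                                    continue; // zero padding
                                }
                                acc += inp[((n*Cin + ci)*H + iy)*W + ix] *
                                       knl[((co*Cin + ci)*KH + ky)*KW + kx];
                            }
                        }
                    }
                    dst[((n*Cout + co)*OH + oy)*OW + ox] = acc;
                }
            }
        }
    }
}
```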
181 changes: 180 additions & 1 deletion ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -457,6 +457,7 @@ struct vk_device_struct {
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
vk_pipeline pipeline_opt_step_adamw_f32;
vk_pipeline pipeline_conv2d_f32;
vk_pipeline pipeline_conv2d_dw_whcn_f32;
vk_pipeline pipeline_conv2d_dw_cwhn_f32;

@@ -816,6 +817,38 @@ struct vk_op_rwkv_wkv7_push_constants {
uint32_t H;
};

struct vk_op_conv2d_push_constants {
uint32_t Cout;
uint32_t Cin;
uint32_t N;

uint32_t KW;
uint32_t KH;
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;

uint32_t s0;
uint32_t s1;
uint32_t p0;
uint32_t p1;
uint32_t d0;
uint32_t d1;

uint32_t nb01;
uint32_t nb02;
uint32_t nb03;

uint32_t nb11;
uint32_t nb12;
uint32_t nb13;

uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
};

struct vk_op_conv2d_dw_push_constants {
uint32_t ne;
uint32_t batches;
@@ -916,16 +949,33 @@ class vk_memory_logger {
class vk_perf_logger {
public:
void print_timings() {
if (timings.empty()) {
return;
}
std::cerr << "----------------\nVulkan Timings:" << std::endl;
for (const auto& t : timings) {
uint64_t total = 0;
for (const auto& time : t.second) {
total += time;
}
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";

// If we have as many flops entries as timing entries for the op, compute and log the average rate.
auto it = flops.find(t.first);
if (it != flops.end() && it->second.size() == t.second.size()) {
uint64_t total_nflops = 0;
for (const auto& elem : it->second) {
total_nflops += elem;
}
// timings are in ns, so flops/ns is numerically GFLOP/s
std::cerr << " (" << double(total_nflops) / double(total) << " GFLOP/s)";
}
std::cerr << std::endl;
}

timings.clear();
flops.clear();
}

void log_timing(const ggml_tensor * node, uint64_t time) {
@@ -944,12 +994,33 @@ class vk_perf_logger {
name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
}
timings[name].push_back(time);
flops[name].push_back(m * n * (k + (k - 1)));
return;
}
if (node->op == GGML_OP_CONV_2D) {
std::string name = ggml_op_name(node->op);
ggml_tensor * knl = node->src[0];
uint64_t OW = node->ne[0];
uint64_t OH = node->ne[1];
uint64_t N = node->ne[3];
uint64_t Cout = node->ne[2];
uint64_t KW = knl->ne[0];
uint64_t KH = knl->ne[1];
uint64_t Cin = knl->ne[2];
// KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
uint64_t size_M = Cout;
uint64_t size_K = Cin * KW * KH;
uint64_t size_N = N * OW * OH;
uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
flops[name].push_back(n_flops);
timings[name].push_back(time);
return;
}
timings[ggml_op_name(node->op)].push_back(time);
}
private:
std::map<std::string, std::vector<uint64_t>> timings;
std::map<std::string, std::vector<uint64_t>> flops;
};
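As a sanity check on the FLOP accounting above (all numbers illustrative): for a CONV_2D with Cout = 128, Cin = 64, a 3x3 kernel and a 224x224 output on a single image, M = 128, K = 64 * 3 * 3 = 576 and N = 224 * 224 = 50176, so n_flops = 128 * 50176 * (2 * 576 - 1) ≈ 7.39 GFLOP; if such a run took 2 ms, the logger would report roughly 3.7 TFLOP/s.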

struct ggml_backend_vk_context {
@@ -2806,6 +2877,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);

@@ -6578,6 +6651,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_leaky_relu_f32;
}
return nullptr;
case GGML_OP_CONV_2D:
if (src0->type == GGML_TYPE_F32 &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
ggml_is_contiguous(dst)) {
return ctx->device->pipeline_conv2d_f32;
}
return nullptr;
case GGML_OP_CONV_2D_DW:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (ggml_is_contiguous(src1)) {
@@ -6899,6 +6982,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
const uint32_t OW = dst->ne[0];
elements = { N * OC * OH * OW, 1, 1};
} break;
case GGML_OP_CONV_2D:
{
// src0 - kernel: [KW, KH, Cin, Cout]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]

// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
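// e.g. ins = 224, ks = 3, s = 1, p = 1, d = 1 gives (224 + 2*1 - 1*(3-1) - 1)/1 + 1 = 224 (illustrative values)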
// parallelize in {Cout / BS_K, NPQ / BS_NPQ, 1} workgroups
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[3];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
int64_t NPQ = N * OW * OH;

// Tile the output matrix into (Cout / BS_K, NPQ / BS_NPQ, 1) workgroups
elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
} break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
@@ -7753,6 +7860,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
}, dryrun);
}

static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

GGML_TENSOR_BINARY_OP_LOCALS

GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb10 == sizeof(float));
GGML_ASSERT(nb0 == sizeof(float));

vk_op_conv2d_push_constants p{};
p.Cout = static_cast<uint32_t>(ne03);
p.Cin = static_cast<uint32_t>(ne02);
p.N = static_cast<uint32_t>(ne13);

p.KW = static_cast<uint32_t>(ne00);
p.KH = static_cast<uint32_t>(ne01);
p.W = static_cast<uint32_t>(ne10);
p.H = static_cast<uint32_t>(ne11);
p.OW = static_cast<uint32_t>(ne0);
p.OH = static_cast<uint32_t>(ne1);

p.s0 = static_cast<uint32_t>(dst->op_params[0]);
p.s1 = static_cast<uint32_t>(dst->op_params[1]);
p.p0 = static_cast<uint32_t>(dst->op_params[2]);
p.p1 = static_cast<uint32_t>(dst->op_params[3]);
p.d0 = static_cast<uint32_t>(dst->op_params[4]);
p.d1 = static_cast<uint32_t>(dst->op_params[5]);

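// strides are passed to the shader in elements rather than bytes, hence the division by the element size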
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
p.nb03 = static_cast<uint32_t>(nb03 / nb00);

p.nb11 = static_cast<uint32_t>(nb11 / nb10);
p.nb12 = static_cast<uint32_t>(nb12 / nb10);
p.nb13 = static_cast<uint32_t>(nb13 / nb10);

p.nb1 = static_cast<uint32_t>(nb1 / nb0);
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
p.nb3 = static_cast<uint32_t>(nb3 / nb0);

GGML_ASSERT(ne03 == ne2);
GGML_ASSERT(ne02 == ne12);

ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
}

static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
vk_op_conv2d_dw_push_constants p{};
p.ne = ggml_nelements(dst);
@@ -8799,6 +8955,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_POOL_2D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
@@ -8864,6 +9021,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_POOL_2D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_LEAKY_RELU:
{
@@ -9042,6 +9200,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_POOL_2D:
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);

break;
case GGML_OP_CONV_2D:
ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);

break;
case GGML_OP_CONV_2D_DW:
ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -9168,6 +9330,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_POOL_2D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
@@ -10242,6 +10405,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
return true;
case GGML_OP_CONV_TRANSPOSE_1D:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_CONV_2D:
// Channel-contiguous format is not supported yet.
return (op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32 &&
op->type == GGML_TYPE_F32 &&
ggml_is_contiguous(op->src[0]) &&
ggml_is_contiguous(op->src[1]) &&
ggml_is_contiguous(op));
default:
return false;
}
@@ -10765,6 +10936,14 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
const int32_t p1 = tensor->op_params[6];

tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
} else if (tensor->op == GGML_OP_CONV_2D) {
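// reference path: ggml_conv_2d lowers to im2col + matmul, so the direct GEMM result is cross-checked against it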
const int32_t s0 = tensor->op_params[0];
const int32_t s1 = tensor->op_params[1];
const int32_t p0 = tensor->op_params[2];
const int32_t p1 = tensor->op_params[3];
const int32_t d0 = tensor->op_params[4];
const int32_t d1 = tensor->op_params[5];
tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
const float * op_params = (const float *)tensor->op_params;
tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);