From f8e816e3f7f8be79f1b213f70ff40b6238717921 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Aug 2023 13:14:22 +0300 Subject: [PATCH 1/7] metal : fix memory leak --- ggml-metal.m | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index 06eb3872e25e4..d38534055afff 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1121,6 +1121,9 @@ void ggml_metal_graph_compute( [command_buffers[n_cb - 1] waitUntilCompleted]; + // release resources + [queue release]; + // check status of command buffers // needed to detect if the device ran out-of-memory for example (#1881) for (int i = 0; i < n_cb; i++) { @@ -1129,5 +1132,9 @@ void ggml_metal_graph_compute( fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } + + [command_buffers[i] release]; } + + [command_buffers release]; } From 59196290f8f2db28ee34529dcc6827966ed30564 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Aug 2023 20:59:10 +0300 Subject: [PATCH 2/7] metal : fix encoders memory leak --- ggml-metal.m | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index d38534055afff..a362a360a573e 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -521,13 +521,16 @@ void ggml_metal_graph_compute( const int n_cb = ctx->n_cb; - NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; + NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; + NSMutableArray * command_encoders = [NSMutableArray arrayWithCapacity:n_cb]; for (int i = 0; i < n_cb; ++i) { command_buffers[i] = [ctx->queue commandBuffer]; // enqueue the command buffers in order to specify their execution order [command_buffers[i] enqueue]; + + command_encoders[i] = [command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; } // TODO: is this the best way to start threads? 
@@ -541,9 +544,8 @@ void ggml_metal_graph_compute( size_t offs_src1 = 0; size_t offs_dst = 0; - id command_buffer = command_buffers[cb_idx]; - - id encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; + id command_buffer = command_buffers[cb_idx]; + id encoder = command_encoders[cb_idx]; const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); @@ -1133,8 +1135,10 @@ void ggml_metal_graph_compute( GGML_ASSERT(false); } + [command_encoders[i] release]; [command_buffers[i] release]; } + [command_encoders release]; [command_buffers release]; } From e7c4cccef6b2197b5960cc72198a517dddf90297 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 25 Aug 2023 09:36:45 +0300 Subject: [PATCH 3/7] metal : clean up more memory resources --- ggml-metal.m | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index a362a360a573e..eae307469918f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -239,9 +239,65 @@ @implementation GGMLMetalClass void ggml_metal_free(struct ggml_metal_context * ctx) { fprintf(stderr, "%s: deallocating\n", __func__); +#define GGML_METAL_DEL_KERNEL(name) \ + [ctx->function_##name release]; \ + [ctx->pipeline_##name release]; + + GGML_METAL_DEL_KERNEL(add); + GGML_METAL_DEL_KERNEL(add_row); + GGML_METAL_DEL_KERNEL(mul); + GGML_METAL_DEL_KERNEL(mul_row); + GGML_METAL_DEL_KERNEL(scale); + GGML_METAL_DEL_KERNEL(silu); + GGML_METAL_DEL_KERNEL(relu); + GGML_METAL_DEL_KERNEL(gelu); + GGML_METAL_DEL_KERNEL(soft_max); + GGML_METAL_DEL_KERNEL(diag_mask_inf); + GGML_METAL_DEL_KERNEL(get_rows_f16); + GGML_METAL_DEL_KERNEL(get_rows_q4_0); + GGML_METAL_DEL_KERNEL(get_rows_q4_1); + GGML_METAL_DEL_KERNEL(get_rows_q8_0); + GGML_METAL_DEL_KERNEL(get_rows_q2_K); + GGML_METAL_DEL_KERNEL(get_rows_q3_K); + GGML_METAL_DEL_KERNEL(get_rows_q4_K); + GGML_METAL_DEL_KERNEL(get_rows_q5_K); + 
GGML_METAL_DEL_KERNEL(get_rows_q6_K); + GGML_METAL_DEL_KERNEL(rms_norm); + GGML_METAL_DEL_KERNEL(norm); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DEL_KERNEL(rope); + GGML_METAL_DEL_KERNEL(alibi_f32); + GGML_METAL_DEL_KERNEL(cpy_f32_f16); + GGML_METAL_DEL_KERNEL(cpy_f32_f32); + GGML_METAL_DEL_KERNEL(cpy_f16_f16); + +#undef GGML_METAL_DEL_KERNEL + for (int i = 0; i < ctx->n_buffers; ++i) { [ctx->buffers[i].metal release]; } + + [ctx->library release]; + [ctx->queue release]; + [ctx->device release]; + free(ctx); } @@ -1124,6 +1180,7 @@ void ggml_metal_graph_compute( [command_buffers[n_cb - 1] waitUntilCompleted]; // release resources + [edesc release]; [queue release]; // check status of command buffers From 67dd7463ce920580762b86e45fb507bac27d0128 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 25 Aug 2023 19:05:21 +0300 Subject: [PATCH 4/7] metal : fix more leaks --- ggml-metal.m | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index eae307469918f..e825b630bad93 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1177,15 +1177,11 @@ void ggml_metal_graph_compute( // wait for all threads to finish dispatch_barrier_sync(queue, ^{}); - [command_buffers[n_cb - 1] waitUntilCompleted]; - 
- // release resources - [edesc release]; - [queue release]; - // check status of command buffers // needed to detect if the device ran out-of-memory for example (#1881) for (int i = 0; i < n_cb; i++) { + [command_buffers[i] waitUntilCompleted]; + MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); @@ -1196,6 +1192,10 @@ void ggml_metal_graph_compute( [command_buffers[i] release]; } + // release resources + [edesc release]; + [queue release]; + [command_encoders release]; [command_buffers release]; } From 43a8a6297b155dc63a721b45855a589e86ee718e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Aug 2023 09:57:36 +0300 Subject: [PATCH 5/7] metal : reuse dispatch queue + autoreleasepool --- ggml-metal.m | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index e825b630bad93..1ab8ae8e26909 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -33,12 +33,12 @@ struct ggml_metal_context { int n_cb; - float * logits; - id device; id queue; id library; + dispatch_queue_t d_queue; + int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; @@ -120,6 +120,7 @@ @implementation GGMLMetalClass ctx->n_buffers = 0; ctx->concur_list_len = 0; + ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); #if 0 // compile from source string and show compile log @@ -298,6 +299,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { [ctx->queue release]; [ctx->device release]; + dispatch_release(ctx->d_queue); + free(ctx); } @@ -563,6 +566,8 @@ void ggml_metal_graph_compute( struct ggml_cgraph * gf) { metal_printf("%s: evaluating graph\n", __func__); + @autoreleasepool { + // if there is ctx->concur_list, dispatch concurrently // else fallback to serial dispatch MTLComputePassDescriptor * edesc = 
MTLComputePassDescriptor.computePassDescriptor; @@ -589,13 +594,10 @@ void ggml_metal_graph_compute( command_encoders[i] = [command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; } - // TODO: is this the best way to start threads? - dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); - for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; - dispatch_async(queue, ^{ + dispatch_async(ctx->d_queue, ^{ size_t offs_src0 = 0; size_t offs_src1 = 0; size_t offs_dst = 0; @@ -1175,7 +1177,7 @@ void ggml_metal_graph_compute( } // wait for all threads to finish - dispatch_barrier_sync(queue, ^{}); + dispatch_barrier_sync(ctx->d_queue, ^{}); // check status of command buffers // needed to detect if the device ran out-of-memory for example (#1881) @@ -1187,15 +1189,7 @@ void ggml_metal_graph_compute( fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } - - [command_encoders[i] release]; - [command_buffers[i] release]; } - // release resources - [edesc release]; - [queue release]; - - [command_encoders release]; - [command_buffers release]; + } } From fffd1670696717619324b1e049892c3091d69d74 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Aug 2023 10:49:27 +0300 Subject: [PATCH 6/7] metal : reuse array for command buffers and encoders --- ggml-metal.h | 1 + ggml-metal.m | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/ggml-metal.h b/ggml-metal.h index 00202b787c804..fca28d37ef970 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -24,6 +24,7 @@ // max memory buffers that can be mapped to the device #define GGML_METAL_MAX_BUFFERS 16 +#define GGML_METAL_MAX_COMMAND_BUFFERS 32 struct ggml_tensor; struct ggml_cgraph; diff --git a/ggml-metal.m b/ggml-metal.m index 1ab8ae8e26909..ad2ee8cf5fef0 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -37,6 +37,9 @@ id queue; id library; + id 
command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS]; + id command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS]; + dispatch_queue_t d_queue; int n_buffers; @@ -114,7 +117,7 @@ @implementation GGMLMetalClass struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); - ctx->n_cb = n_cb; + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS); /* cap at the capacity of ctx->command_buffers / ctx->command_encoders */ ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -320,7 +323,7 @@ void ggml_metal_host_free(void * data) { } void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { - ctx->n_cb = n_cb; + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS); /* cap at the capacity of ctx->command_buffers / ctx->command_encoders */ } int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { @@ -582,16 +585,13 @@ void ggml_metal_graph_compute( const int n_cb = ctx->n_cb; - NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; - NSMutableArray * command_encoders = [NSMutableArray arrayWithCapacity:n_cb]; - for (int i = 0; i < n_cb; ++i) { - command_buffers[i] = [ctx->queue commandBuffer]; + ctx->command_buffers[i] = [ctx->queue commandBuffer]; // enqueue the command buffers in order to specify their execution order - [command_buffers[i] enqueue]; + [ctx->command_buffers[i] enqueue]; - command_encoders[i] = [command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; + ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; } for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { @@ -602,8 +602,8 @@ void ggml_metal_graph_compute( size_t offs_src1 = 0; size_t offs_dst = 0; - id command_buffer = command_buffers[cb_idx]; - id encoder = command_encoders[cb_idx]; + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = ctx->command_encoders[cb_idx]; const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_end = MIN((cb_idx == n_cb - 1) ? 
n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); @@ -1182,9 +1182,9 @@ void ggml_metal_graph_compute( // check status of command buffers // needed to detect if the device ran out-of-memory for example (#1881) for (int i = 0; i < n_cb; i++) { - [command_buffers[i] waitUntilCompleted]; + [ctx->command_buffers[i] waitUntilCompleted]; - MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status]; + MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); From ddfa865926e3accf9ad9af0246344124cef020ce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Aug 2023 10:49:47 +0300 Subject: [PATCH 7/7] ggml : assert for odd number of blocks on ARM 15M tinyllama is an example --- ggml.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 767c19ae2b58a..54f426bc066b6 100644 --- a/ggml.c +++ b/ggml.c @@ -2436,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2445,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_0 * restrict x0 = &x[i + 0]; const block_q4_0 * restrict x1 = &x[i + 1]; @@ -2623,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } // Main loop + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 2; i < nb; i+=2) { _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); @@ -2706,7 +2707,6 
@@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -2718,6 +2718,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * float summs = 0; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict x1 = &x[i + 1]; @@ -2832,7 +2833,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_0); const block_q5_0 * restrict x = vx; @@ -2848,6 +2848,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_0 * restrict x0 = &x[i]; const block_q5_0 * restrict x1 = &x[i + 1]; @@ -3072,7 +3073,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_1); const block_q5_1 * restrict x = vx; @@ -3091,6 +3091,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_1 * restrict x0 = &x[i]; const block_q5_1 * restrict x1 = &x[i + 1]; @@ -3328,7 +3329,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -3337,6 +3337,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = 
vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q8_0 * restrict x0 = &x[i + 0]; const block_q8_0 * restrict x1 = &x[i + 1];