From f8e816e3f7f8be79f1b213f70ff40b6238717921 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 24 Aug 2023 13:14:22 +0300
Subject: [PATCH 1/7] metal : fix memory leak

---
 ggml-metal.m | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/ggml-metal.m b/ggml-metal.m
index 06eb3872e25e4..d38534055afff 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1121,6 +1121,9 @@ void ggml_metal_graph_compute(
 
     [command_buffers[n_cb - 1] waitUntilCompleted];
 
+    // release resources
+    [queue release];
+
     // check status of command buffers
     // needed to detect if the device ran out-of-memory for example (#1881)
     for (int i = 0; i < n_cb; i++) {
@@ -1129,5 +1132,9 @@ void ggml_metal_graph_compute(
             fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
             GGML_ASSERT(false);
         }
+
+        [command_buffers[i] release];
     }
+
+    [command_buffers release];
 }

From 59196290f8f2db28ee34529dcc6827966ed30564 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 24 Aug 2023 20:59:10 +0300
Subject: [PATCH 2/7] metal : fix encoders memory leak

---
 ggml-metal.m | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index d38534055afff..a362a360a573e 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -521,13 +521,16 @@ void ggml_metal_graph_compute(
 
     const int n_cb = ctx->n_cb;
 
-    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+    NSMutableArray * command_buffers  = [NSMutableArray arrayWithCapacity:n_cb];
+    NSMutableArray * command_encoders = [NSMutableArray arrayWithCapacity:n_cb];
 
     for (int i = 0; i < n_cb; ++i) {
         command_buffers[i] = [ctx->queue commandBuffer];
 
         // enqueue the command buffers in order to specify their execution order
         [command_buffers[i] enqueue];
+
+        command_encoders[i] = [command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
     }
 
     // TODO: is this the best way to start threads?
@@ -541,9 +544,8 @@ void ggml_metal_graph_compute(
             size_t offs_src1 = 0;
             size_t offs_dst  = 0;
 
-            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
-
-            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
+            id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
+            id<MTLComputeCommandEncoder> encoder = command_encoders[cb_idx];
 
             const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
             const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
@@ -1133,8 +1135,10 @@ void ggml_metal_graph_compute(
             GGML_ASSERT(false);
         }
 
+        [command_encoders[i] release];
         [command_buffers[i] release];
     }
 
+    [command_encoders release];
     [command_buffers release];
 }

From e7c4cccef6b2197b5960cc72198a517dddf90297 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 25 Aug 2023 09:36:45 +0300
Subject: [PATCH 3/7] metal : clean up more memory resources

---
 ggml-metal.m | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/ggml-metal.m b/ggml-metal.m
index a362a360a573e..eae307469918f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -239,9 +239,65 @@ @implementation GGMLMetalClass
 
 void ggml_metal_free(struct ggml_metal_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);
+#define GGML_METAL_DEL_KERNEL(name) \
+    [ctx->function_##name release]; \
+    [ctx->pipeline_##name release];
+
+    GGML_METAL_DEL_KERNEL(add);
+    GGML_METAL_DEL_KERNEL(add_row);
+    GGML_METAL_DEL_KERNEL(mul);
+    GGML_METAL_DEL_KERNEL(mul_row);
+    GGML_METAL_DEL_KERNEL(scale);
+    GGML_METAL_DEL_KERNEL(silu);
+    GGML_METAL_DEL_KERNEL(relu);
+    GGML_METAL_DEL_KERNEL(gelu);
+    GGML_METAL_DEL_KERNEL(soft_max);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf);
+    GGML_METAL_DEL_KERNEL(get_rows_f16);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DEL_KERNEL(get_rows_q8_0);
+    GGML_METAL_DEL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DEL_KERNEL(rms_norm);
+    GGML_METAL_DEL_KERNEL(norm);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(rope);
+    GGML_METAL_DEL_KERNEL(alibi_f32);
+    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
+    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
+
+#undef GGML_METAL_DEL_KERNEL
+
     for (int i = 0; i < ctx->n_buffers; ++i) {
         [ctx->buffers[i].metal release];
     }
+
+    [ctx->library release];
+    [ctx->queue release];
+    [ctx->device release];
+
     free(ctx);
 }
 
@@ -1124,6 +1180,7 @@ void ggml_metal_graph_compute(
     [command_buffers[n_cb - 1] waitUntilCompleted];
 
     // release resources
+    [edesc release];
     [queue release];
 
     // check status of command buffers

From 67dd7463ce920580762b86e45fb507bac27d0128 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 25 Aug 2023 19:05:21 +0300
Subject: [PATCH 4/7] metal : fix more leaks

---
 ggml-metal.m | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index eae307469918f..e825b630bad93 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1177,15 +1177,11 @@ void ggml_metal_graph_compute(
     // wait for all threads to finish
     dispatch_barrier_sync(queue, ^{});
 
-    [command_buffers[n_cb - 1] waitUntilCompleted];
-
-    // release resources
-    [edesc release];
-    [queue release];
-
     // check status of command buffers
     // needed to detect if the device ran out-of-memory for example (#1881)
     for (int i = 0; i < n_cb; i++) {
+        [command_buffers[i] waitUntilCompleted];
+
         MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
         if (status != MTLCommandBufferStatusCompleted) {
             fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
@@ -1196,6 +1192,10 @@ void ggml_metal_graph_compute(
         [command_buffers[i] release];
     }
 
+    // release resources
+    [edesc release];
+    [queue release];
+
     [command_encoders release];
     [command_buffers release];
 }

From 43a8a6297b155dc63a721b45855a589e86ee718e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 28 Aug 2023 09:57:36 +0300
Subject: [PATCH 5/7] metal : reuse dispatch queue + autoreleasepool

---
 ggml-metal.m | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index e825b630bad93..1ab8ae8e26909 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -33,12 +33,12 @@
 struct ggml_metal_context {
     int n_cb;
 
-    float * logits;
-
     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
     id<MTLLibrary>      library;
 
+    dispatch_queue_t d_queue;
+
     int n_buffers;
     struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
 
@@ -120,6 +120,7 @@ @implementation GGMLMetalClass
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;
 
+    ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
 
 #if 0
     // compile from source string and show compile log
@@ -298,6 +299,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     [ctx->queue release];
     [ctx->device release];
 
+    dispatch_release(ctx->d_queue);
+
     free(ctx);
 }
 
@@ -563,6 +566,8 @@ void ggml_metal_graph_compute(
                struct ggml_cgraph * gf) {
     metal_printf("%s: evaluating graph\n", __func__);
 
+    @autoreleasepool {
+
     // if there is ctx->concur_list, dispatch concurrently
     // else fallback to serial dispatch
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
@@ -589,13 +594,10 @@ void ggml_metal_graph_compute(
         command_encoders[i] = [command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
     }
 
-    // TODO: is this the best way to start threads?
-    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
-
     for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
         const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
 
-        dispatch_async(queue, ^{
+        dispatch_async(ctx->d_queue, ^{
             size_t offs_src0 = 0;
             size_t offs_src1 = 0;
             size_t offs_dst  = 0;
@@ -1175,7 +1177,7 @@ void ggml_metal_graph_compute(
     }
 
     // wait for all threads to finish
-    dispatch_barrier_sync(queue, ^{});
+    dispatch_barrier_sync(ctx->d_queue, ^{});
 
     // check status of command buffers
     // needed to detect if the device ran out-of-memory for example (#1881)
@@ -1187,15 +1189,7 @@ void ggml_metal_graph_compute(
             fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
             GGML_ASSERT(false);
         }
-
-        [command_encoders[i] release];
-        [command_buffers[i] release];
     }
 
-    // release resources
-    [edesc release];
-    [queue release];
-
-    [command_encoders release];
-    [command_buffers release];
+    }
 }

From fffd1670696717619324b1e049892c3091d69d74 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 28 Aug 2023 10:49:27 +0300
Subject: [PATCH 6/7] metal : reuse array for command buffers and encoders

---
 ggml-metal.h |  1 +
 ggml-metal.m | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/ggml-metal.h b/ggml-metal.h
index 00202b787c804..fca28d37ef970 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -24,6 +24,7 @@
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32
 
 struct ggml_tensor;
 struct ggml_cgraph;
diff --git a/ggml-metal.m b/ggml-metal.m
index 1ab8ae8e26909..ad2ee8cf5fef0 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -37,6 +37,9 @@
     id<MTLCommandQueue> queue;
     id<MTLLibrary>      library;
 
+    id<MTLCommandBuffer>         command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
+    id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
+
     dispatch_queue_t d_queue;
 
     int n_buffers;
@@ -114,7 +117,7 @@ @implementation GGMLMetalClass
 
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
-    ctx->n_cb   = n_cb;
+    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue  = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
@@ -320,7 +323,7 @@ void ggml_metal_host_free(void * data) {
 }
 
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
-    ctx->n_cb = n_cb;
+    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
 }
 
 int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
@@ -582,16 +585,13 @@ void ggml_metal_graph_compute(
 
     const int n_cb = ctx->n_cb;
 
-    NSMutableArray * command_buffers  = [NSMutableArray arrayWithCapacity:n_cb];
-    NSMutableArray * command_encoders = [NSMutableArray arrayWithCapacity:n_cb];
-
     for (int i = 0; i < n_cb; ++i) {
-        command_buffers[i] = [ctx->queue commandBuffer];
+        ctx->command_buffers[i] = [ctx->queue commandBuffer];
 
         // enqueue the command buffers in order to specify their execution order
-        [command_buffers[i] enqueue];
+        [ctx->command_buffers[i] enqueue];
 
-        command_encoders[i] = [command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
+        ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
     }
 
     for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
@@ -602,8 +602,8 @@ void ggml_metal_graph_compute(
             size_t offs_src1 = 0;
             size_t offs_dst  = 0;
 
-            id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
-            id<MTLComputeCommandEncoder> encoder = command_encoders[cb_idx];
+            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+            id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
 
             const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
             const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
@@ -1182,9 +1182,9 @@ void ggml_metal_graph_compute(
     // check status of command buffers
     // needed to detect if the device ran out-of-memory for example (#1881)
     for (int i = 0; i < n_cb; i++) {
-        [command_buffers[i] waitUntilCompleted];
+        [ctx->command_buffers[i] waitUntilCompleted];
 
-        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
         if (status != MTLCommandBufferStatusCompleted) {
             fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
             GGML_ASSERT(false);

From ddfa865926e3accf9ad9af0246344124cef020ce Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 28 Aug 2023 10:49:47 +0300
Subject: [PATCH 7/7] ggml : assert for odd number of blocks on ARM

15M tinyllama is an example
---
 ggml.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 767c19ae2b58a..54f426bc066b6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2436,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q4_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -2445,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_0 * restrict x0 = &x[i + 0];
         const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2623,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     }
 
     // Main loop
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 2; i < nb; i+=2) {
         _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
         _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2706,7 +2707,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q4_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
@@ -2718,6 +2718,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 
     float summs = 0;
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_1 * restrict x0 = &x[i + 0];
         const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2832,7 +2833,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_0);
 
     const block_q5_0 * restrict x = vx;
@@ -2848,6 +2848,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_0 * restrict x0 = &x[i];
         const block_q5_0 * restrict x1 = &x[i + 1];
@@ -3072,7 +3073,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_1);
 
     const block_q5_1 * restrict x = vx;
@@ -3091,6 +3091,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_1 * restrict x0 = &x[i];
         const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3328,7 +3329,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q8_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -3337,6 +3337,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q8_0 * restrict x0 = &x[i + 0];
         const block_q8_0 * restrict x1 = &x[i + 1];