From ed5f4fe00ecad67f951643011fef76c3c9e04007 Mon Sep 17 00:00:00 2001
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Fri, 31 Mar 2023 16:39:39 +0200
Subject: [PATCH 1/8] Initial version of q4_0 matrix multiplication benchmark

---
 Makefile                            |   4 +
 tests/test-benchmark-q4_0-matmult.c | 212 ++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 tests/test-benchmark-q4_0-matmult.c

diff --git a/Makefile b/Makefile
index 83a4514ef7177..370a405a36b96 100644
--- a/Makefile
+++ b/Makefile
@@ -256,6 +256,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 # Tests
 #
 
+benchmark:
+	$(CXX) $(CXXFLAGS) tests/test-benchmark-q4_0-matmult.c ggml.o -o tests/test-benchmark-q4_0-matmult $(LDFLAGS)
+	tests/test-benchmark-q4_0-matmult
+
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
diff --git a/tests/test-benchmark-q4_0-matmult.c b/tests/test-benchmark-q4_0-matmult.c
new file mode 100644
index 0000000000000..2d74f5837c6e4
--- /dev/null
+++ b/tests/test-benchmark-q4_0-matmult.c
@@ -0,0 +1,212 @@
+/*
+    License: MIT License
+
+    Changelog:
+    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
+
+*/
+
+#include <locale.h>
+#include "ggml.h"
+#include <assert.h>
+#include <math.h>
+#include <cstdio>
+#include <cinttypes>
+#include <unordered_map>
+#include <queue>
+
+uint64_t rdtsc(){
+    unsigned int lo,hi;
+    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+    return ((uint64_t)hi << 32) | lo;
+}
+
+float tensor_sum_elements(struct ggml_tensor * tensor) {
+    float sum = 0;
+    if (tensor->type==6) { // 6 == GGML_TYPE_F32 in this version of ggml
+        for (int j = 0; j < tensor->ne[1]; j++) {
+            for (int k = 0; k < tensor->ne[0]; k++) {
+                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+            }
+        }
+    }
+    return sum;
+}
+
+
+/*
+    These map to "UNKNOWN":
+    GGML_TYPE_I8,
+    GGML_TYPE_I16,
+    GGML_TYPE_I32,
+    GGML_TYPE_COUNT,
+*/
+
+#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
+
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
+        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
+    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+
+int main(void) {
+    // create the ggml context
+    printf("Starting Test\n");
+
+
+
+    struct ggml_context * ctx;
+    //const int sizex = 4096;
+    //const int sizey = 11008;
+
+#undef VERBOSE_DEBUGGING
+#ifndef VERBOSE_DEBUGGING
+    const int sizey = 4096;
+    const int sizex = 11008;
+    const int sizez = 128;
+#else
+    /* Working - let's increase size */
+    const int sizey = 1;
+    const int sizex = (8*32);
+    const int sizez = 1;
+
+    /*const int sizey = 1;
+    const int sizex = 3*(8*32);
+    const int sizez = 1;*/
+#endif
+
+    //printf("Memsize required = %i\n", sizex*sizex);
+    ggml_type wtype = GGML_TYPE_F32;
+
+    size_t ctx_size = 0;
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizeof(float);
+    ctx_size += 1024*1024*100;
+
+    printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ 0
+    };
+
+    ctx = ggml_init(params);
+    if (!ctx) {
+        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+        return 1;
+    }
+
+
+    printf("Creating new tensors\n");
+    // printf("Creating new tensor m11\n");
+    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m11, 1.0f);
+
+    // printf("Creating new tensor m12\n");
+    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m12, 1.5f);
+
+    // printf("Creating new tensor m2\n");
+    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
+    ggml_set_f32(m2, 2.0f);
+
+    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    // printf("Creating new tensor m11xm2\n");
+    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
+
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf = ggml_build_forward(m11xm2);
+
+    gf.n_threads=1;
+    printf("cgraph->n_threads=%i\n",gf.n_threads);
+
+    TENSOR_DUMP(m11);
+    TENSOR_DUMP(m2);
+
+    ggml_graph_compute(ctx, &gf);
+
+    TENSOR_DUMP(gf.nodes[0]);
+
+    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+
+    int32_t nelements = sizex*sizey;
+    int32_t ne[2] = { sizex, sizey };
+
+    std::vector<int64_t> hist_cur(1 << 4, 0);
+
+    // Set up the benchmark matrices
+    // printf("Creating new tensor q11 & Running quantize\n");
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+
+    // Set up the compute graph
+    // printf("Creating new tensor q31\n");
+    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
+
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf31 = ggml_build_forward(q31);
+    gf31.n_threads=1;
+
+    // Set up a second graph computation to make sure we override the CPU cache lines
+    // printf("Creating new tensor q12 & Running quantize\n");
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+
+    // printf("Creating new tensor q32\n");
+    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
+
+    //printf("Creating compute graph\n");
+    struct ggml_cgraph gf32 = ggml_build_forward(q32);
+    gf32.n_threads=1;
+    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+
+    const int dimx = sizex;
+    const int dimy = sizey;
+    const int dimz = sizez;
+    long long int flops_per_dot_product = dimy + dimy;
+    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz;
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+
+
+    // We cannot use the F32 result, because it will not be exactly the same (due to quantization)
+    // float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+    float sum_of_F32_reference = 11611395072.00f;
+
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_CPU_Cycles; FLOPS_per_Cycle\n");
+    printf("============================================================================================\n");
+
+    setlocale(LC_ALL,"de_DE_UTF8");
+
+    for (int i=0;i<10;i++) {
+
+        long long int start = rdtsc();
+        //printf("Running ggml_graph_compute\n");
+        ggml_graph_compute(ctx, &gf31);
+        long long int stop = rdtsc();
+        long long int cycles = stop-start;
+        float flops_per_cycle = (1.0f*flops_per_matrix)/cycles;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%19lli;%16.2f\n",i,gf31.n_threads, sizex, sizey, sizez, flops_per_matrix, cycles,flops_per_cycle);
+
+#ifdef VERBOSE_DEBUGGING
+        TENSOR_DUMP(gf31.nodes[0])
+#endif
+
+        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
+        if (sum_of_Q4_result != sum_of_F32_reference) {
+            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f\n",
+                sum_of_F32_reference,
+                sum_of_Q4_result
+            );
+            exit(1);
+        }
+
+        // Running a different graph computation to make sure we override the CPU cache lines
+        ggml_graph_compute(ctx, &gf32);
+
+    }
+
+}
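A note on the FLOP accounting in this first patch: for a matrix product with shared dimension K, every output element costs K multiplies and K adds, so flops_per_dot_product = dimy + dimy above is the standard 2K. The patch groups the factors as 2*sizey times sizex*sizez; either grouping gives the same total 2*sizex*sizey*sizez. A standalone sketch of the same arithmetic (not part of the patch):

    #include <cstdio>

    int main() {
        // Same shapes as the benchmark: shared dimension 4096, 11008 x 128 output elements
        const long long sizex = 11008, sizey = 4096, sizez = 128;
        const long long flops_per_dot_product = 2 * sizey; // one multiply and one add per element
        const long long flops_per_matrix = flops_per_dot_product * sizex * sizez;
        printf("%.2f gFLOP per matrix multiplication\n", flops_per_matrix / 1e9); // prints ~11.54
        return 0;
    }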
"Q4_1" : "UNKNOWN" + +#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \ + TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\ + TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \ + { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); } + +int main(void) { + // create the ggml context + printf("Starting Test\n"); + + + + struct ggml_context * ctx; + //const int sizex = 4096; + //const int sizey = 11008; + +#undef VERBOSE_DEBUGGING +#ifndef VERBOSE_DEBUGGING + const int sizey = 4096; + const int sizex = 11008; + const int sizez = 128; +#else + /* Working - let's increase size */ + const int sizey = 1; + const int sizex = (8*32); + const int sizez = 1; + + /*const int sizey = 1; + const int sizex = 3*(8*32); + const int sizez = 1;*/ +#endif + + //printf("Memsize required = %i\n", sizex*sizex); + ggml_type wtype = GGML_TYPE_F32; + + size_t ctx_size = 0; + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += sizex*sizeof(float); + ctx_size += 1024*1024*100; + + printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + ctx = ggml_init(params); + if (!ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + + + printf("Creating new tensors\n"); + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m11, 1.0f); + + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m12, 1.5f); + + // printf("Creating new tensor m2\n"); + struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); + ggml_set_f32(m2, 2.0f); + + printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); + // printf("Creating new tensor m11xm2\n"); + struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf = ggml_build_forward(m11xm2); + + gf.n_threads=1; + printf("cgraph->n_threads=%i\n",gf.n_threads); + + TENSOR_DUMP(m11); + TENSOR_DUMP(m2); + + ggml_graph_compute(ctx, &gf); + + TENSOR_DUMP(gf.nodes[0]); + + printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); + + int32_t nelements = sizex*sizey; + int32_t ne[2] = { sizex, sizey }; + + std::vector hist_cur(1 << 4, 0); + + // Set up a the benchmark matrices + // printf("Creating new tensor q11 & Running quantize\n"); + struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); + ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); + + // Set up a the compute graph + // printf("Creating new tensor q31\n"); + struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf31 = ggml_build_forward(q31); + gf31.n_threads=1; + + // Set up a second graph computation to make sure we override the CPU cache lines + // printf("Creating new tensor q12 & Running quantize\n"); + struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, 
From fd2f59a03d4c2cf6f55b1ed02f548a4f4b2c60c9 Mon Sep 17 00:00:00 2001
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Fri, 31 Mar 2023 17:38:19 +0200
Subject: [PATCH 3/8] Reviewer requests: added parameter for threads, switched
 to ggml_time_us()

---
 tests/test-benchmark-q4_0-matmult.c | 66 ++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/tests/test-benchmark-q4_0-matmult.c b/tests/test-benchmark-q4_0-matmult.c
index 2d74f5837c6e4..34569baff5236 100644
--- a/tests/test-benchmark-q4_0-matmult.c
+++ b/tests/test-benchmark-q4_0-matmult.c
@@ -10,11 +10,17 @@
 #include "ggml.h"
 #include <assert.h>
 #include <math.h>
+#include <cstring>
 #include <cstdio>
 #include <cinttypes>
 #include <unordered_map>
 #include <queue>
+#include <string.h>
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <iterator>
 
 uint64_t rdtsc(){
     unsigned int lo,hi;
     __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
@@ -50,7 +56,42 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
         TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
     { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
 
-int main(void) {
+void print_usage(int /*argc*/, char ** argv, const int n_threads) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", n_threads);
+    fprintf(stderr, "\n");
+}
+
+int main(int argc, char ** argv) {
+
+    int n_threads = 1;
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            n_threads = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, n_threads);
+            exit(0);
+        }
+        if (invalid_param) {
+            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+            print_usage(argc, argv, n_threads);
+            exit(1);
+        }
+    }
+
+
     // create the ggml context
     printf("Starting Test\n");
 
@@ -121,7 +162,7 @@ int main(int argc, char ** argv) {
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf = ggml_build_forward(m11xm2);
 
-    gf.n_threads=1;
+    gf.n_threads=n_threads;
     printf("cgraph->n_threads=%i\n",gf.n_threads);
 
     TENSOR_DUMP(m11);
@@ -149,7 +190,7 @@ int main(int argc, char ** argv) {
 
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf31 = ggml_build_forward(q31);
-    gf31.n_threads=1;
+    gf31.n_threads=n_threads;
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
@@ -176,20 +217,25 @@ int main(int argc, char ** argv) {
     // float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
     float sum_of_F32_reference = 11611395072.00f;
 
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_CPU_Cycles; FLOPS_per_Cycle\n");
-    printf("============================================================================================\n");
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
+    printf("==============================================================================================\n");
 
     setlocale(LC_ALL,"de_DE_UTF8");
 
     for (int i=0;i<10;i++) {
 
-        long long int start = rdtsc();
+        long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
         ggml_graph_compute(ctx, &gf31);
-        long long int stop = rdtsc();
-        long long int cycles = stop-start;
-        float flops_per_cycle = (1.0f*flops_per_matrix)/cycles;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%19lli;%16.2f\n",i,gf31.n_threads, sizex, sizey, sizez, flops_per_matrix, cycles,flops_per_cycle);
+        long long int stop = ggml_time_us();
+        long long int usec = stop-start;
+        float sec = usec/1000000.0f;
+        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+            i,
+            gf31.n_threads,
+            sizex, sizey, sizez, flops_per_matrix,
+            usec,flops_per_usec);
 
 #ifdef VERBOSE_DEBUGGING
         TENSOR_DUMP(gf31.nodes[0])
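ggml_time_us() measures wall-clock microseconds, which is more robust for this purpose than rdtsc: on modern CPUs the TSC ticks at a constant rate decoupled from the actual core frequency, so raw cycle counts are hard to interpret and hard to compare across machines. If a ggml-independent timer were ever needed, a minimal equivalent could look like the sketch below (the function name time_us is illustrative, not a ggml API):

    #include <chrono>
    #include <cstdint>

    // Monotonic wall clock in microseconds, comparable to what ggml_time_us() returns.
    static int64_t time_us() {
        using namespace std::chrono;
        return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }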
From 6e691af99773b25c308e6ff833683142fb0f20f3 Mon Sep 17 00:00:00 2001
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Sat, 1 Apr 2023 20:27:19 +0200
Subject: [PATCH 4/8] Reviewer input: removed rdtsc, use epsilon for check

---
 tests/test-benchmark-q4_0-matmult.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/tests/test-benchmark-q4_0-matmult.c b/tests/test-benchmark-q4_0-matmult.c
index 34569baff5236..5285d4741edd3 100644
--- a/tests/test-benchmark-q4_0-matmult.c
+++ b/tests/test-benchmark-q4_0-matmult.c
@@ -22,12 +22,6 @@
 #include <string>
 #include <iterator>
 
-uint64_t rdtsc(){
-    unsigned int lo,hi;
-    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
-    return ((uint64_t)hi << 32) | lo;
-}
-
 float tensor_sum_elements(struct ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==6) { // 6 == GGML_TYPE_F32 in this version of ggml
@@ -213,9 +207,9 @@
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // We cannot use the F32 result, because it will not be exactly the same (due to quantization)
-    // float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
-    float sum_of_F32_reference = 11611395072.00f;
+    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
     printf("==============================================================================================\n");
@@ -241,11 +235,18 @@
         TENSOR_DUMP(gf31.nodes[0])
 #endif
 
+        // Check that the matrix multiplication result is in the right ballpark
+        // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
         float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-        if (sum_of_Q4_result != sum_of_F32_reference) {
-            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f\n",
+        float delta = fabsf(sum_of_Q4_result - sum_of_F32_reference);
+        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
+
+        if (delta > allowed_delta) {
+            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
                 sum_of_F32_reference,
-                sum_of_Q4_result
+                sum_of_Q4_result,
+                delta,
+                allowed_delta
             );
             exit(1);
         }
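The check introduced here is a relative tolerance, not an absolute one: allowed_delta is one millionth of the reference sum, so the acceptance band scales with the magnitude of the result. The same idea as a reusable helper, a sketch rather than code from the patch:

    #include <cmath>

    // True when actual is within rel_tol of expected, relative to expected's magnitude.
    static bool close_enough(float expected, float actual, float rel_tol) {
        return std::fabs(actual - expected) <= rel_tol * std::fabs(expected);
    }

Here the call would be close_enough(sum_of_F32_reference, sum_of_Q4_result, 1e-6f).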
From 100dc551e104f941be59a79e7a233032bf6fa611 Mon Sep 17 00:00:00 2001
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Sat, 1 Apr 2023 20:33:11 +0200
Subject: [PATCH 5/8] Review comment: Removed setlocale

---
 tests/test-benchmark-q4_0-matmult.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test-benchmark-q4_0-matmult.c b/tests/test-benchmark-q4_0-matmult.c
index 5285d4741edd3..f06a76932cafc 100644
--- a/tests/test-benchmark-q4_0-matmult.c
+++ b/tests/test-benchmark-q4_0-matmult.c
@@ -214,8 +214,6 @@
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
     printf("==============================================================================================\n");
 
-    setlocale(LC_ALL,"de_DE_UTF8");
-
     for (int i=0;i<10;i++) {
 
         long long int start = ggml_time_us();
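One practical effect of removing the setlocale call: printf formats floating-point numbers with the decimal separator of the active LC_NUMERIC locale, so a German locale would print 3,14 instead of 3.14 and garble the semicolon-separated output for dot-expecting parsers. A minimal illustration of the effect (assuming the de_DE locale is installed on the system):

    #include <clocale>
    #include <cstdio>

    int main() {
        printf("%.2f\n", 3.14f);            // "3.14" in the default "C" locale
        setlocale(LC_ALL, "de_DE.UTF-8");   // assumption: this locale is installed
        printf("%.2f\n", 3.14f);            // typically prints "3,14"
        return 0;
    }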
From 5833baeeec62e19b053c1d37d29cf724019a470c Mon Sep 17 00:00:00 2001
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Sat, 1 Apr 2023 21:06:47 +0200
Subject: [PATCH 6/8] Feature: Param for number of iterations, Bugfix for use
 of parameter threads

---
 tests/test-benchmark-q4_0-matmult.c | 35 ++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/tests/test-benchmark-q4_0-matmult.c b/tests/test-benchmark-q4_0-matmult.c
index f06a76932cafc..9ca9b133a9290 100644
--- a/tests/test-benchmark-q4_0-matmult.c
+++ b/tests/test-benchmark-q4_0-matmult.c
@@ -50,18 +50,25 @@
         TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
     { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
 
-void print_usage(int /*argc*/, char ** argv, const int n_threads) {
+struct benchmark_params_struct {
+    int32_t n_threads     = 1;
+    int32_t n_iterations  = 10;
+};
+
+void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", n_threads);
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -i N, --iter N        number of iterations to use during computation (default: %d)\n", params.n_iterations);
     fprintf(stderr, "\n");
 }
 
 int main(int argc, char ** argv) {
 
-    int n_threads = 1;
+
+    struct benchmark_params_struct benchmark_params;
 
     bool invalid_param = false;
     std::string arg;
@@ -73,14 +80,20 @@ int main(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            n_threads = std::stoi(argv[i]);
-        } else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv, n_threads);
+            benchmark_params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-i" || arg == "--iter") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_iterations = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, benchmark_params);
             exit(0);
         }
         if (invalid_param) {
             fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-            print_usage(argc, argv, n_threads);
+            print_usage(argc, argv, benchmark_params);
             exit(1);
         }
     }
@@ -156,7 +169,7 @@ int main(int argc, char ** argv) {
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf = ggml_build_forward(m11xm2);
 
-    gf.n_threads=n_threads;
+    gf.n_threads=benchmark_params.n_threads;
     printf("cgraph->n_threads=%i\n",gf.n_threads);
 
     TENSOR_DUMP(m11);
@@ -184,7 +197,7 @@ int main(int argc, char ** argv) {
 
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf31 = ggml_build_forward(q31);
-    gf31.n_threads=n_threads;
+    gf31.n_threads=benchmark_params.n_threads;
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
@@ -196,7 +209,7 @@ int main(int argc, char ** argv) {
     //printf("Creating compute graph\n");
     struct ggml_cgraph gf32 = ggml_build_forward(q32);
-    gf32.n_threads=1;
+    gf32.n_threads=benchmark_params.n_threads;
     printf("cgraph->n_threads=%i\n",gf31.n_threads);
 
     const int dimx = sizex;
@@ -214,7 +227,7 @@ int main(int argc, char ** argv) {
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
     printf("==============================================================================================\n");
 
-    for (int i=0;i<10;i++) {
+    for (int i=0;i<benchmark_params.n_iterations;i++) {
 
         long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
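With the iteration flag in place, a typical run at this point in the series (the binary still lives under tests/ per the Makefile) might look like the following; exact timings of course vary by machine:

    make benchmark
    ./tests/test-benchmark-q4_0-matmult --threads 4 --iter 20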
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Sat, 1 Apr 2023 21:18:42 +0200
Subject: [PATCH 7/8] Reviewer suggestion: Moved to examples

---
 .../benchmark/benchmark-q4_0-matmult.c | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/test-benchmark-q4_0-matmult.c => examples/benchmark/benchmark-q4_0-matmult.c (100%)

diff --git a/tests/test-benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c
similarity index 100%
rename from tests/test-benchmark-q4_0-matmult.c
rename to examples/benchmark/benchmark-q4_0-matmult.c

From 56c78d16d21ebf5d15e22480d73625cfc5c15de0 Mon Sep 17 00:00:00 2001
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:16:43 +0200
Subject: [PATCH 8/8] Reviewer feedback: Updated clean: and benchmark: sections

---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index d00f110fcfaa4..707dfa3581b96 100644
--- a/Makefile
+++ b/Makefile
@@ -235,7 +235,7 @@ common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
-	rm -vf *.o main quantize perplexity embedding
+	rm -vf *.o main quantize perplexity embedding examples/benchmark/benchmark-q4_0-matmult
 
 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -257,8 +257,8 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 #
 
 benchmark: ggml.o
-	$(CXX) $(CXXFLAGS) tests/test-benchmark-q4_0-matmult.c ggml.o -o tests/test-benchmark-q4_0-matmult $(LDFLAGS)
-	tests/test-benchmark-q4_0-matmult
+	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o examples/benchmark/benchmark-q4_0-matmult $(LDFLAGS)
+	examples/benchmark/benchmark-q4_0-matmult
 
 .PHONY: tests
 tests:
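After the final patch the tool lives under examples/, clean removes the built binary, and the benchmark target both builds and immediately runs it, so a full cycle is simply:

    make clean
    make benchmark
    # or run it directly with the flags from patch 6:
    ./examples/benchmark/benchmark-q4_0-matmult -t 4 -i 20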