From 32bc3f4fcf941dc6656b638074d28f94fdf48da2 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Sat, 9 Sep 2023 23:04:53 -0400
Subject: [PATCH 1/2] llama : enable mmap in quantize on Linux -> 31% faster

---
 llama.cpp | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index a6502612232f8..bfff91be1e271 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5658,7 +5658,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    // mmap consistently increases speed Linux, is inconsistent on macOS
+    // (possibly related to free memory), and has not been tested on Windows.
+#ifdef __linux__
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, use_mmap));
+    if (ml->use_mmap) {
+        ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(*ml, model);
@@ -5736,10 +5747,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml->use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",

From b3a6b2862262ddd2eeb6a93d435dcc7b58551a05 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Sat, 23 Sep 2023 23:39:28 -0400
Subject: [PATCH 2/2] also enable mmap on Windows

---
 llama.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index bfff91be1e271..e2e325eb48c60 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5658,9 +5658,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    // mmap consistently increases speed Linux, is inconsistent on macOS
-    // (possibly related to free memory), and has not been tested on Windows.
-#ifdef __linux__
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
 #else
     constexpr bool use_mmap = false;