From a1aa65e069d05ca48f7b80bbbfce941431f20d5e Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Thu, 2 May 2024 12:02:26 +0200
Subject: [PATCH 1/5] feat: change convert-hf-to-gguf

---
 convert-hf-to-gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2f146d7302a78..28bf20e99e274 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -271,7 +271,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:

         chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

-        chktok = tokenizer.encode(chktxt)
+        chktok = tokenizer.decode(tokenizer.encode(chktxt))
         chkhsh = sha256(str(chktok).encode()).hexdigest()

         print(f"chktok: {chktok}")

From 0672cd8f4268c1ccef22fdb7558d327ad213f88b Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Thu, 2 May 2024 15:31:00 +0200
Subject: [PATCH 2/5] use convert_ids_to_tokens

---
 convert-hf-to-gguf.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 28bf20e99e274..555ef8fa1c1f5 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -271,10 +271,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:

         chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

-        chktok = tokenizer.decode(tokenizer.encode(chktxt))
-        chkhsh = sha256(str(chktok).encode()).hexdigest()
+        token_ids = tokenizer.encode(chktxt)
+        token_list = tokenizer.convert_ids_to_tokens(token_ids)
+        chkhsh = sha256(str(token_list).encode()).hexdigest()

-        print(f"chktok: {chktok}")
+        print(f"token_ids: {token_ids}")
+        print(f"token_list: {token_list}")
         print(f"chkhsh: {chkhsh}")

         res = None

From 0f94ff7155f3dd8756e009f8104d7672e1f64d9a Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Thu, 2 May 2024 16:39:58 +0200
Subject: [PATCH 3/5] fix: only do pre-tokenization and normalization

---
 convert-hf-to-gguf.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 555ef8fa1c1f5..b7e715cb6e53f 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -271,12 +271,10 @@ def get_vocab_base_pre(self, tokenizer) -> str:

         chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

-        token_ids = tokenizer.encode(chktxt)
-        token_list = tokenizer.convert_ids_to_tokens(token_ids)
-        chkhsh = sha256(str(token_list).encode()).hexdigest()
+        pre_out = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
+        chkhsh = sha256(str(pre_out).encode()).hexdigest()

-        print(f"token_ids: {token_ids}")
-        print(f"token_list: {token_list}")
+        print(f"pre_out: {pre_out}")
         print(f"chkhsh: {chkhsh}")

         res = None

From c7a6c32882111ff977d8f8b1b2d4e310873d8566 Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Thu, 2 May 2024 18:01:00 +0200
Subject: [PATCH 4/5] fix: protect against slow tokenizer

---
 convert-hf-to-gguf.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index b7e715cb6e53f..5a5d3b49a6ad8 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -271,10 +271,14 @@ def get_vocab_base_pre(self, tokenizer) -> str:

         chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

-        pre_out = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
-        chkhsh = sha256(str(pre_out).encode()).hexdigest()
+        if hasattr(tokenizer, 'backend_tokenizer'):
+            chktok = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
+            chkhsh = sha256(str(chktok).encode()).hexdigest()
+        else:
+            chktok = tokenizer.encode(chktxt)
+            chkhsh = sha256(str(chktok).encode()).hexdigest()

-        print(f"pre_out: {pre_out}")
+        print(f"chktok: {chktok}")
         print(f"chkhsh: {chkhsh}")

         res = None

From ef5dd3603ed5fa4bf3acaba20abf8607d1558c5c Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Mon, 6 May 2024 09:19:36 +0200
Subject: [PATCH 5/5] fix: use is_fast check

---
 convert-hf-to-gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5a5d3b49a6ad8..ce504e9edbd74 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -271,7 +271,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:

         chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

-        if hasattr(tokenizer, 'backend_tokenizer'):
+        if tokenizer.is_fast:
             chktok = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
             chkhsh = sha256(str(chktok).encode()).hexdigest()
         else:
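
Taken together, the series changes the pre-tokenizer fingerprint in get_vocab_base_pre from hashing token ids to hashing the normalizer + pre-tokenizer output whenever a fast (Rust-backed) tokenizer is available, falling back to plain encode() for slow tokenizers. A minimal standalone sketch of the logic as of PATCH 5/5, assuming the transformers package; the model id here is illustrative, not taken from the patches:

    # Sketch of the fingerprint computed after PATCH 5/5.
    # "bert-base-uncased" is an example model id, not from the patches.
    from hashlib import sha256
    from transformers import AutoTokenizer

    chktxt = '...'  # same stress-test string as in the patches above

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    if tokenizer.is_fast:
        # Fast tokenizers expose the underlying tokenizers.Tokenizer as
        # backend_tokenizer; hashing only the normalized, pre-tokenized
        # pieces makes the fingerprint independent of the vocabulary.
        # Note: some fast tokenizers have normalizer set to None, which
        # this line (like the patch) does not guard against.
        chktok = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(
            tokenizer.backend_tokenizer.normalizer.normalize_str(chktxt))
    else:
        # Slow (pure-Python) tokenizers have no backend_tokenizer;
        # fall back to hashing the encoded ids.
        chktok = tokenizer.encode(chktxt)

    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(f"chktok: {chktok}")
    print(f"chkhsh: {chkhsh}")

pre_tokenize_str returns a list of (piece, (start, end)) tuples, so str(chktok) is stable for a given normalizer and pre-tokenizer configuration, which is what makes the sha256 digest usable as a fingerprint.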