
Commit da9f2af

[ExecuTorch][Llama] Change runner to enable chunked prefill
This diff adds code to chunk prompts longer than max_seq_len, enabling prefill of larger contexts.

Differential Revision: [D71833061](https://our.internmc.facebook.com/intern/diff/D71833061/)
ghstack-source-id: 275212294
Pull Request resolved: #9785
1 parent: 2aa7748

File tree

1 file changed: +21 -5 lines

examples/models/llama/runner/runner.cpp

Lines changed: 21 additions & 5 deletions
@@ -11,6 +11,7 @@
 
 #include <executorch/examples/models/llama/runner/runner.h>
 
+#include <algorithm>
 #include <ctime>
 
 #include <executorch/extension/llm/runner/util.h>
@@ -221,11 +222,11 @@ Error Runner::generate(
 
   ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token");
   ET_CHECK_MSG(
-      num_prompt_tokens < metadata_.at(kMaxSeqLen),
+      num_prompt_tokens < metadata_.at(kMaxContextLen),
       "num_prompt_tokens %d >= max_seq_len_ %" PRId64
       ", Max seq length exceeded - please increase max seq len value in .../llama2/model.py",
       num_prompt_tokens,
-      metadata_.at(kMaxSeqLen));
+      metadata_.at(kMaxContextLen));
   ET_CHECK_MSG(
       num_prompt_tokens < seq_len,
       "num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - please increase the seq_len value passed to generate()",
@@ -241,11 +242,26 @@ Error Runner::generate(
     wrapped_callback(prompt);
   }
   int64_t pos = 0;
-  auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos);
+  uint64_t cur_token;
+  int max_seq_len = metadata_.at(kMaxSeqLen) -
+      1; // -1 because for some reason tracing results in this upperbound
+  int num_tokens_to_process = 0;
+  while (num_tokens_to_process < num_prompt_tokens) {
+    auto num_tokens_to_prefill_with =
+        std::min(num_prompt_tokens - num_tokens_to_process, max_seq_len);
+    std::vector<uint64_t> prompt_tokens_to_process(num_tokens_to_prefill_with);
+    std::copy(
+        prompt_tokens.begin() + num_tokens_to_process,
+        prompt_tokens.begin() + num_tokens_to_process + num_tokens_to_prefill_with,
+        prompt_tokens_to_process.begin());
+    auto prefill_res =
+        text_prefiller_->prefill(prompt_tokens_to_process, pos);
+    ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
+    cur_token = prefill_res.get();
+    num_tokens_to_process += num_tokens_to_prefill_with;
+  }
   stats_.first_token_ms = llm::time_in_ms();
   stats_.prompt_eval_end_ms = llm::time_in_ms();
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
-  uint64_t cur_token = prefill_res.get();
 
   // print the first token from prefill. No prev_token so use cur_token for it.
   wrapped_callback(
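
As a standalone illustration of the chunking logic introduced above, here is a minimal C++ sketch that walks a prompt in chunks of at most max_seq_len tokens. It is not part of the commit: chunked_prefill, prefill_chunk, and fake_prefill are hypothetical names standing in for the runner's text_prefiller_->prefill() call, and the sketch assumes the prefill callback advances the cache position by the number of tokens it consumed, which is what keeps the loop in the diff consistent across chunks.

// Minimal sketch of a chunked-prefill loop, assuming a prefill callback that
// consumes one chunk of tokens and returns the next predicted token.
// chunked_prefill and prefill_chunk are illustrative names, not ExecuTorch APIs.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

uint64_t chunked_prefill(
    const std::vector<uint64_t>& prompt_tokens,
    int max_seq_len, // largest chunk the model can prefill at once
    const std::function<uint64_t(const std::vector<uint64_t>&, int64_t&)>&
        prefill_chunk) {
  uint64_t cur_token = 0;
  int64_t pos = 0; // running position in the KV cache
  int num_tokens_to_process = 0;
  const int num_prompt_tokens = static_cast<int>(prompt_tokens.size());
  while (num_tokens_to_process < num_prompt_tokens) {
    // Take at most max_seq_len tokens per prefill call.
    int chunk_len =
        std::min(num_prompt_tokens - num_tokens_to_process, max_seq_len);
    std::vector<uint64_t> chunk(
        prompt_tokens.begin() + num_tokens_to_process,
        prompt_tokens.begin() + num_tokens_to_process + chunk_len);
    cur_token = prefill_chunk(chunk, pos); // callback advances pos by chunk_len
    num_tokens_to_process += chunk_len;
  }
  return cur_token; // first generated token after the full prompt
}

int main() {
  // 300 prompt tokens with max_seq_len 127 prefill in chunks of 127, 127, 46.
  std::vector<uint64_t> prompt(300);
  for (size_t i = 0; i < prompt.size(); ++i) prompt[i] = i;
  auto fake_prefill = [](const std::vector<uint64_t>& chunk, int64_t& pos) {
    printf("prefilled %zu tokens starting at pos %lld\n",
           chunk.size(), static_cast<long long>(pos));
    pos += static_cast<int64_t>(chunk.size());
    return chunk.back() + 1; // placeholder for the model's next token
  };
  chunked_prefill(prompt, 128 - 1, fake_prefill);
  return 0;
}

Only the last chunk's return value matters: it becomes cur_token, the first token echoed after prefill, while the earlier chunks only populate the KV cache. With 300 prompt tokens and kMaxSeqLen of 128, the diff's loop (which subtracts 1 from kMaxSeqLen) prefills 127, 127, and then 46 tokens.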
