
Commit 6312ea6

Update on "[ExecuTorch][Llama] Change runner to enable chunked prefill"
This diff adds code to chunk prompts longer than max_seq_len so that longer contexts can be prefilled.

Differential Revision: [D71833061](https://our.internmc.facebook.com/intern/diff/D71833061/)

[ghstack-poisoned]
1 parent ae93078 commit 6312ea6
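
As a concrete illustration of what the change enables (the numbers are illustrative, not taken from the diff): with max_seq_len = 128 the runner keeps max_seq_len_ = 127, so a 300-token prompt is prefilled as three chunks of 127, 127, and 46 tokens, each submitted at the current KV-cache position, rather than the whole prompt having to fit into a single prefill call.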

2 files changed: +17 −13 lines changed

extension/llm/runner/text_prefiller.cpp

Lines changed: 15 additions & 12 deletions
@@ -24,7 +24,8 @@ TextPrefiller::TextPrefiller(
     : text_decoder_runner_(text_decoder_runner),
       use_kv_cache_(use_kv_cache),
       enable_parallel_prefill_(enable_parallel_prefill),
-      max_seq_len_(max_seq_len > 0 ? max_seq_len - 1 : 127) {} // -1 because for some reason tracing results in this upperbound
+      max_seq_len_(max_seq_len > 0 ? max_seq_len - 1 : 127) {
+} // -1 because for some reason tracing results in this upperbound

 ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
     std::vector<uint64_t>& prompt_tokens,
@@ -33,33 +34,35 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
   if (!text_decoder_runner_->is_method_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
   }
-
+
   // Check if we need to chunk the prompt tokens
   int32_t num_prompt_tokens = prompt_tokens.size();
-
+
   // If prompt tokens exceed max_seq_len_, we need to chunk them
   if (num_prompt_tokens > max_seq_len_) {
     uint64_t cur_token = 0;
     int num_tokens_to_process = 0;
-
+
     while (num_tokens_to_process < num_prompt_tokens) {
-      auto num_tokens_to_prefill_with =
-          std::min<int>(num_prompt_tokens - num_tokens_to_process, max_seq_len_);
-
-      std::vector<uint64_t> prompt_tokens_to_process(num_tokens_to_prefill_with);
+      auto num_tokens_to_prefill_with = std::min<int>(
+          num_prompt_tokens - num_tokens_to_process, max_seq_len_);
+
+      std::vector<uint64_t> prompt_tokens_to_process(
+          num_tokens_to_prefill_with);
       std::copy(
           prompt_tokens.begin() + num_tokens_to_process,
-          prompt_tokens.begin() + num_tokens_to_process + num_tokens_to_prefill_with,
+          prompt_tokens.begin() + num_tokens_to_process +
+              num_tokens_to_prefill_with,
          prompt_tokens_to_process.begin());
-
+
       // Process this chunk
       auto chunk_result = prefillChunk(prompt_tokens_to_process, start_pos);
       ET_CHECK_OK_OR_RETURN_ERROR(chunk_result.error());
       cur_token = chunk_result.get();
-
+
       num_tokens_to_process += num_tokens_to_prefill_with;
     }
-
+
     return cur_token;
   } else {
     // If prompt tokens don't exceed max_seq_len_, process them directly
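
The heart of the change is the new chunking loop above: a prompt longer than max_seq_len_ is consumed in consecutive slices of at most max_seq_len_ tokens, each slice is handed to prefillChunk at the current KV-cache position, and the result of the last chunk is returned. The sketch below is a self-contained illustration of that arithmetic only; fake_prefill_chunk is a hypothetical stand-in for TextPrefiller::prefillChunk (assumed here to advance start_pos), and no ExecuTorch types or APIs are used.

```cpp
// Minimal, self-contained sketch of the chunked-prefill loop added in this
// commit. fake_prefill_chunk is a hypothetical stand-in for
// TextPrefiller::prefillChunk: it returns a dummy "next token" and advances
// start_pos by the chunk size (an assumption; the real update happens inside
// ExecuTorch).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static uint64_t fake_prefill_chunk(
    const std::vector<uint64_t>& chunk,
    int64_t& start_pos) {
  start_pos += static_cast<int64_t>(chunk.size());
  return chunk.back() + 1; // pretend the model predicted "last token + 1"
}

int main() {
  const int32_t max_seq_len = 8; // plays the role of max_seq_len_
  std::vector<uint64_t> prompt_tokens(20);
  for (size_t i = 0; i < prompt_tokens.size(); ++i) {
    prompt_tokens[i] = static_cast<uint64_t>(i);
  }

  int64_t start_pos = 0;
  uint64_t cur_token = 0;
  const int32_t num_prompt_tokens = static_cast<int32_t>(prompt_tokens.size());
  int32_t num_tokens_to_process = 0;

  // Same shape as the loop in TextPrefiller::prefill: take at most
  // max_seq_len tokens per iteration until the whole prompt is consumed.
  while (num_tokens_to_process < num_prompt_tokens) {
    const int32_t num_tokens_to_prefill_with = std::min<int32_t>(
        num_prompt_tokens - num_tokens_to_process, max_seq_len);
    std::vector<uint64_t> chunk(
        prompt_tokens.begin() + num_tokens_to_process,
        prompt_tokens.begin() + num_tokens_to_process +
            num_tokens_to_prefill_with);
    cur_token = fake_prefill_chunk(chunk, start_pos);
    num_tokens_to_process += num_tokens_to_prefill_with;
  }

  // With 20 tokens and max_seq_len = 8 this consumes chunks of 8, 8 and 4.
  std::cout << "consumed " << num_tokens_to_process << " tokens, final pos "
            << start_pos << ", last predicted token " << cur_token << "\n";
  return 0;
}
```

Note that in the real runner the guard `num_prompt_tokens > max_seq_len_` keeps the single-chunk fast path unchanged; only oversized prompts take the copy-into-prompt_tokens_to_process path.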

extension/llm/runner/text_prefiller.h

Lines changed: 2 additions & 1 deletion
@@ -39,7 +39,8 @@ class ET_EXPERIMENTAL TextPrefiller {
   /**
    * Helper method to prefill a chunk of tokens.
    * @param prompt_tokens The chunk of text prompt tokens to process.
-   * @param start_pos The starting position in KV cache of the input in the LLM Module.
+   * @param start_pos The starting position in KV cache of the input in the LLM
+   * Module.
    * @return The next token of the LLM Module after prefilling this chunk.
    */
   ::executorch::runtime::Result<uint64_t> prefillChunk(
