diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h
index 3de418b57ea..a5ad6fcab0a 100644
--- a/examples/models/llava/runner/llava_text_decoder_runner.h
+++ b/examples/models/llava/runner/llava_text_decoder_runner.h
@@ -11,6 +11,7 @@
 #pragma once

 #include <executorch/extension/llm/runner/text_decoder_runner.h>
+#include <executorch/extension/tensor/tensor.h>

 namespace example {

@@ -18,18 +19,20 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner
     : public executorch::extension::llm::TextDecoderRunner {
  public:
   explicit LlavaTextDecoderRunner(executorch::extension::Module* module)
-      : TextDecoderRunner(module, true) {}
+      : TextDecoderRunner(module) {}

   inline executorch::runtime::Result<executorch::aten::Tensor> step(
       executorch::extension::TensorPtr& tokens,
-      executorch::extension::TensorPtr& start_pos) override {
+      int64_t start_pos) override {
     // run token embedding
     auto token_embedding_outputs =
         ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, tokens));

+    auto start_pos_tensor = ::executorch::extension::from_blob(
+        &start_pos, {1}, executorch::aten::ScalarType::Long);
+
     // run text model
     auto outputs_res = ET_UNWRAP(module_->execute(
-        kTextModelMethod, {start_pos, token_embedding_outputs[0]}));
+        kTextModelMethod, {start_pos_tensor, token_embedding_outputs[0]}));

     ET_CHECK_MSG(
         outputs_res.size() == 1,
diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt
index 7f3f8ad1519..b1ec3c0fd1c 100644
--- a/extension/llm/runner/CMakeLists.txt
+++ b/extension/llm/runner/CMakeLists.txt
@@ -56,6 +56,7 @@ add_subdirectory(
 set(runner_deps executorch_core extension_module extension_tensor tokenizers)

 target_link_libraries(extension_llm_runner PUBLIC ${runner_deps})
+set_target_properties(extension_llm_runner PROPERTIES POSITION_INDEPENDENT_CODE ON)

 target_include_directories(
   extension_llm_runner
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index 244515112ac..b6434d3e51d 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -34,6 +34,7 @@ def define_common_targets():
             ],
             exported_deps = [
                 ":stats",
+                "//executorch/kernels/portable/cpu/util:arange_util" + aten_suffix,
                 "//executorch/extension/llm/sampler:sampler" + aten_suffix,
                 "//executorch/extension/module:module" + aten_suffix,
                 "//executorch/extension/tensor:tensor" + aten_suffix,
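A quick sketch of the call pattern that results from this change — illustrative only, not code from this PR (the model path and token id are placeholders). `step()` now takes the position as a plain `int64_t`, and the runner decides internally how to tensorize it:

```cpp
#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <cstdint>

using executorch::extension::Module;
using executorch::extension::from_blob;
using executorch::extension::llm::TextDecoderRunner;

int main() {
  Module module("/path/to/model.pte");  // placeholder path
  TextDecoderRunner runner(&module);    // no use_kv_cache flag anymore
  if (runner.load() != executorch::runtime::Error::Ok) {
    return 1;
  }
  int64_t token = 42;  // placeholder token id
  auto tokens = from_blob(&token, {1, 1}, executorch::aten::ScalarType::Long);
  auto logits = runner.step(tokens, /*start_pos=*/0);  // int64_t, not a TensorPtr
  return logits.ok() ? 0 : 1;
}
```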
load(":targets.bzl", "define_common_targets") - +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") oncall("executorch") define_common_targets() + +runtime.cxx_test( + name = "test_text_decoder_runner", + srcs = ["test_text_decoder_runner.cpp"], + deps = [ + "//executorch/extension/llm/runner:runner_lib", + "//executorch/kernels/portable:generated_lib", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], + env = { + "KVCACHE_CACHE_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheCachePos.pte])", + "KVCACHE_INPUT_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheInputPos.pte])", + "NO_KVCACHE": "$(location fbcode//executorch/test/models:exported_programs[ModuleNoKVCache.pte])", + } +) diff --git a/extension/llm/runner/test/test_text_decoder_runner.cpp b/extension/llm/runner/test/test_text_decoder_runner.cpp new file mode 100644 index 00000000000..c9a8de271f1 --- /dev/null +++ b/extension/llm/runner/test/test_text_decoder_runner.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated + */ + +#include +#include +#include +#include +#include +#include +#include + +using namespace ::testing; +using executorch::extension::Module; +using executorch::extension::TensorPtr; +using executorch::extension::llm::TextDecoderRunner; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; +using executorch::runtime::testing::TensorFactory; + +// Mock Module class for testing +class MockModule : public Module { + public: + MockModule() : Module("") {} +}; + +class TextDecoderRunnerTest : public Test { + protected: + void SetUp() override { + mock_module_ = std::make_unique(); + runner_ = std::make_unique(mock_module_.get()); + } + + std::unique_ptr mock_module_; + std::unique_ptr runner_; +}; + +// Test logits_to_token() method with Float tensor +TEST_F(TextDecoderRunnerTest, LogitsToTokenFloat) { + TensorFactory tf_float; + auto logits = tf_float.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f}); + + // Call logits_to_token with temperature 0 (deterministic) + int32_t token = runner_->logits_to_token(logits, 0.0f); + + // With temperature 0, should return the argmax (index 2) + EXPECT_EQ(token, 2); +} + +// Test logits_to_token() method with 3D tensor (batch, seq_length, vocab_size) +TEST_F(TextDecoderRunnerTest, LogitsToToken3D) { + TensorFactory tf_float; + // Shape: [1, 2, 4] - batch=1, seq_length=2, vocab_size=4 + auto logits = tf_float.make( + {1, 2, 4}, + { + 0.1f, + 0.2f, + 0.3f, + 0.4f, // First sequence position + 0.5f, + 0.6f, + 0.9f, + 0.8f // Second sequence position (last) + }); + + // Call logits_to_token with temperature 0 (deterministic) + int32_t token = runner_->logits_to_token(logits, 0.0f); + + // Should use the last sequence position and return argmax (index 2) + EXPECT_EQ(token, 2); +} + +// Test logits_to_token() method with Half tensor +TEST_F(TextDecoderRunnerTest, LogitsToTokenHalf) { + TensorFactory tf_half; + auto logits = tf_half.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f}); + + // Call logits_to_token with temperature 0 (deterministic) + int32_t token = runner_->logits_to_token(logits, 0.0f); + + // With temperature 0, should return the argmax (index 2) + EXPECT_EQ(token, 2); +} + +// Test 
logits_to_token() method with BFloat16 tensor +TEST_F(TextDecoderRunnerTest, LogitsToTokenBFloat16) { + TensorFactory tf_bfloat16; + auto logits = tf_bfloat16.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f}); + + // Call logits_to_token with temperature 0 (deterministic) + int32_t token = runner_->logits_to_token(logits, 0.0f); + + // With temperature 0, should return the argmax (index 2) + EXPECT_EQ(token, 2); +} + +// Test logits_to_token() method with non-zero temperature +TEST_F(TextDecoderRunnerTest, LogitsToTokenWithTemperature) { + TensorFactory tf_float; + auto logits = tf_float.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f}); + + // Call logits_to_token with temperature > 0 (stochastic) + int32_t token = runner_->logits_to_token(logits, 1.0f); + + // With temperature > 0, result should be within valid range + EXPECT_GE(token, 0); + EXPECT_LT(token, 4); +} + +// Test step() method with all available PTE models +TEST_F(TextDecoderRunnerTest, StepWithAllModels) { + // List of all environment variables for PTE models + std::vector> env_vars = { + {"KVCACHE_CACHE_POS", "KVCACHE_CACHE_POS"}, + {"KVCACHE_INPUT_POS", "KVCACHE_INPUT_POS"}, + {"NO_KVCACHE", "NO_KVCACHE"}}; + + // Check if any environment variables are set up front + bool any_env_set = false; + for (const auto& [model_name, env_var] : env_vars) { + if (std::getenv(env_var)) { + any_env_set = true; + break; + } + } + + // Skip test if no environment variables are set + if (!any_env_set) { + GTEST_SKIP() << "No PTE model environment variables were set"; + } + + bool any_model_tested = false; + + // Loop through all available models + for (const auto& [model_name, env_var] : env_vars) { + const char* model_path = std::getenv(env_var); + if (!model_path) { + continue; // Skip if environment variable not set + } + + SCOPED_TRACE( + "Testing model: " + model_name + " from " + std::string(model_path)); + + // Load the model + auto module = std::make_unique(model_path); + auto load_result = module->load(); + if (load_result != Error::Ok) { + ADD_FAILURE() << "Failed to load model " << model_name << " from " + << model_path << " with error: " << (int)load_result; + continue; + } + + // Create TextDecoderRunner + TextDecoderRunner runner(module.get()); + auto runner_load_result = runner.load(); + ASSERT_EQ(runner_load_result, Error::Ok) + << "Failed to load runner for " << model_name; + + // Verify method is loaded + EXPECT_TRUE(runner.is_method_loaded()) + << "Method not loaded for " << model_name; + + // Create input tensor pointer + + TensorFactory tf_long; + auto input_tokens_ = + tf_long.make({1, 3}, {50, 7, 11}); // Single token input + + auto input_ptr = std::make_shared(input_tokens_); + int64_t start_pos = 0; + + // Call step() and verify result is ok + auto result = runner.step(input_ptr, start_pos); + ASSERT_TRUE(result.ok()) << "step() failed for " << model_name + << " with error: " << (int)result.error(); + + // Verify output tensor is valid + auto output_tensor = result.get(); + EXPECT_GT(output_tensor.numel(), 0) + << "Output tensor empty for " << model_name; + + // Test logits_to_token works + int32_t token = runner.logits_to_token(output_tensor, 0.0f); + EXPECT_GE(token, 0) << "Invalid token for " << model_name; + + any_model_tested = true; + } + + // This should not happen since we checked environment variables up front + ASSERT_TRUE(any_model_tested) + << "No models were tested despite environment variables being set"; +} diff --git a/extension/llm/runner/test/test_text_llm_runner.cpp 
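One subtlety in the `from_blob` pattern that `step()` and the Llava override now rely on: the wrapper is non-owning. A minimal sketch, with an illustrative helper name (not from this PR):

```cpp
#include <executorch/extension/tensor/tensor.h>
#include <cstdint>

// Hypothetical helper, for illustration only: the returned tensor aliases
// start_pos's storage rather than copying it, so it must not outlive the
// int64_t it wraps. The runner stays safe because forward() consumes the
// tensor synchronously within the same step() call.
executorch::extension::TensorPtr wrap_start_pos(int64_t& start_pos) {
  return executorch::extension::from_blob(
      &start_pos, {1}, executorch::aten::ScalarType::Long);
}
```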
diff --git a/extension/llm/runner/test/test_text_llm_runner.cpp b/extension/llm/runner/test/test_text_llm_runner.cpp
index 02f04a69b38..6896c56e961 100644
--- a/extension/llm/runner/test/test_text_llm_runner.cpp
+++ b/extension/llm/runner/test/test_text_llm_runner.cpp
@@ -63,11 +63,11 @@ class MockModule : public ::executorch::extension::Module {

 class MockTextDecoderRunner : public TextDecoderRunner {
  public:
-  MockTextDecoderRunner() : TextDecoderRunner(nullptr, false) {}
+  MockTextDecoderRunner() : TextDecoderRunner(nullptr) {}
   MOCK_METHOD(
       Result<executorch::aten::Tensor>,
       step,
-      (executorch::extension::TensorPtr&, executorch::extension::TensorPtr&),
+      (executorch::extension::TensorPtr&, int64_t),
       ());
   MOCK_METHOD(bool, is_method_loaded, (), ());
   MOCK_METHOD(Result<uint64_t>, prefill, (std::vector<uint64_t>&, int64_t), ());
@@ -134,8 +134,7 @@ class RunnerTest : public Test {
   std::unique_ptr<MockTextDecoderRunner> createMockTextDecoderRunner() {
     auto text_decoder_runner = std::make_unique<MockTextDecoderRunner>();
     ON_CALL(*text_decoder_runner, step)
-        .WillByDefault([&](executorch::extension::TensorPtr&,
-                           executorch::extension::TensorPtr&) {
+        .WillByDefault([&](executorch::extension::TensorPtr&, int64_t) {
           return Result<executorch::aten::Tensor>(tensor);
         });
     ON_CALL(*text_decoder_runner, is_method_loaded())
diff --git a/extension/llm/runner/test/test_text_prefiller.cpp b/extension/llm/runner/test/test_text_prefiller.cpp
index b786fc71978..dc8bdc625e9 100644
--- a/extension/llm/runner/test/test_text_prefiller.cpp
+++ b/extension/llm/runner/test/test_text_prefiller.cpp
@@ -24,11 +24,11 @@ using executorch::runtime::testing::TensorFactory;
 // Mock class for TextDecoderRunner
 class MockTextDecoderRunner : public TextDecoderRunner {
  public:
-  MockTextDecoderRunner() : TextDecoderRunner(nullptr, false) {}
+  MockTextDecoderRunner() : TextDecoderRunner(nullptr) {}
   MOCK_METHOD(
       Result<executorch::aten::Tensor>,
       step,
-      (executorch::extension::TensorPtr&, executorch::extension::TensorPtr&),
+      (executorch::extension::TensorPtr&, int64_t),
       ());
   MOCK_METHOD(bool, is_method_loaded, (), ());
   MOCK_METHOD(Result<uint64_t>, prefill, (std::vector<uint64_t>&, int64_t), ());
@@ -44,8 +44,7 @@ class TextPrefillerTest : public Test {
     ON_CALL(text_decoder_runner_, is_method_loaded())
         .WillByDefault(Return(true));
     ON_CALL(text_decoder_runner_, step)
-        .WillByDefault([&](executorch::extension::TensorPtr&,
-                           executorch::extension::TensorPtr&) {
+        .WillByDefault([&](executorch::extension::TensorPtr&, int64_t) {
          return Result<executorch::aten::Tensor>(tensor);
        });
   }
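The heart of the change comes next: rather than a constructor flag, the runner probes the exported method's signature at run time. A simplified sketch of the detection rule the diff below implements (the real code goes through ET_UNWRAP and returns errors instead of collapsing to a bool):

```cpp
#include <executorch/extension/module/module.h>

// Sketch only: forward(tokens) means no kv cache; a second input is either
// input_pos (numel == 1) or cache_positions (numel == max_seq_len).
bool uses_kv_cache(executorch::extension::Module& module) {
  auto meta = module.method_meta("forward");
  return meta.ok() && meta->num_inputs() > 1;
}
```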
diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp
index 8705dfeb842..e60a07bc50a 100644
--- a/extension/llm/runner/text_decoder_runner.cpp
+++ b/extension/llm/runner/text_decoder_runner.cpp
@@ -9,6 +9,7 @@
 // Given inputs, run a text decoder and return logits.

 #include <executorch/extension/llm/runner/text_decoder_runner.h>
+#include <executorch/kernels/portable/cpu/util/arange_util.h>

 #include <ctime>

@@ -21,18 +22,53 @@ namespace llm {
 // NOTE: we observed ~2x loading performance increase on iPhone 15
 // and a ~5% improvement on Galaxy S22 by switching to
 // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-TextDecoderRunner::TextDecoderRunner(Module* module, bool use_kv_cache)
-    : module_(module), use_kv_cache_(use_kv_cache) {}
+TextDecoderRunner::TextDecoderRunner(Module* module) : module_(module) {}

 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The
 // outer loop (call site) is responsible for managing state.
 ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
     TensorPtr& tokens,
-    TensorPtr& start_pos) {
+    int64_t start_pos) {
   // ET_LOG(Info, "Input token %" PRIu64, input_token);
-  if (use_kv_cache_) {
-    auto outputs_res = module_->forward({tokens, start_pos});
+  auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
+  // If the method takes only one input, we are not using a kv cache.
+  bool use_kv_cache = method_meta.num_inputs() > 1;
+
+  if (use_kv_cache) {
+    // The second argument is either input_pos (numel == 1) or cache_positions
+    // (numel == max_seq_len); inspect its metadata to tell the two apart.
+    auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+    auto sizes = second_input_info.sizes();
+    ET_CHECK_OR_RETURN_ERROR(
+        sizes.size() == 1,
+        InvalidProgram,
+        "The second input tensor is not a 1D tensor. Got dimension (%zu)",
+        sizes.size());
+    auto numel = sizes[0];
+    std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
+
+    TensorPtr start_pos_tensor;
+    if (numel > 1) {
+      // Model is exported with cache_positions: materialize
+      // [start_pos, start_pos + numel) into a Long tensor.
+      start_pos_tensor =
+          empty(sizes_vec, ::executorch::aten::ScalarType::Long);
+      torch::executor::native::arange_out_impl(
+          start_pos, start_pos + numel, 1.0, *start_pos_tensor);
+    } else {
+      // Model is exported with input_pos: wrap the scalar in a size-1 tensor.
+      start_pos_tensor = from_blob(
+          &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
+    }
+    auto outputs_res = module_->forward({tokens, start_pos_tensor});
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h
index 6c1256c6b90..e930763668e 100644
--- a/extension/llm/runner/text_decoder_runner.h
+++ b/extension/llm/runner/text_decoder_runner.h
@@ -21,7 +21,7 @@ namespace llm {

 class ET_EXPERIMENTAL TextDecoderRunner {
  public:
-  TextDecoderRunner(Module* module, bool use_kv_cache);
+  explicit TextDecoderRunner(Module* module);

   virtual ~TextDecoderRunner() = default;

@@ -34,7 +34,7 @@ class ET_EXPERIMENTAL TextDecoderRunner {
    */
   virtual ::executorch::runtime::Result<executorch::aten::Tensor> step(
       TensorPtr& input,
-      TensorPtr& start_pos);
+      int64_t start_pos);

   /**
    * Load the Module for text decode purpose.
@@ -101,7 +101,6 @@ class ET_EXPERIMENTAL TextDecoderRunner {
    * Module remains valid for the duration of TextDecoderRunner's usage.
    */
   Module* module_;
-  bool use_kv_cache_;
   bool should_stop_{false};
 };
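For the cache_positions branch above, the runner fills [start_pos, start_pos + numel) via `arange_out_impl`. A toy, dependency-free illustration of what that produces, assuming a model exported with a max_seq_len of 4:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  int64_t start_pos = 5;
  const int64_t numel = 4;  // assumed max_seq_len of the exported model
  int64_t cache_positions[numel];
  for (int64_t i = 0; i < numel; ++i) {
    cache_positions[i] = start_pos + i;  // arange(start_pos, start_pos + numel)
  }
  for (int64_t v : cache_positions) {
    std::printf("%lld ", static_cast<long long>(v));  // prints: 5 6 7 8
  }
  std::printf("\n");
  return 0;
}
```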
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index 6a0cfd45044..b93988cffd5 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -393,8 +393,7 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
   // Create text_decoder_runner. Use a shared_ptr so that it can be shared with
   // TextPrefiller and TextTokenGenerator
-  auto text_decoder_runner = std::make_unique<TextDecoderRunner>(
-      module.get(), metadata.at(kUseKVCache));
+  auto text_decoder_runner = std::make_unique<TextDecoderRunner>(module.get());

   // Create text_prefiller
   auto text_prefiller = std::make_unique<TextPrefiller>(
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp
index 64f3fee167b..de092b6b05d 100644
--- a/extension/llm/runner/text_prefiller.cpp
+++ b/extension/llm/runner/text_prefiller.cpp
@@ -86,10 +86,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
         {1, num_prompt_tokens},
         executorch::aten::ScalarType::Long);

-    auto start_pos_tensor =
-        from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);
-
-    auto outputs_res = text_decoder_runner_->step(tokens, start_pos_tensor);
+    auto outputs_res = text_decoder_runner_->step(tokens, start_pos);
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());

     ET_LOG(
@@ -106,13 +103,10 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
     auto tokens =
         from_blob(&cur_token, {1, 1}, executorch::aten::ScalarType::Long);

-    auto start_pos_tensor =
-        from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);
-
     // run the first token and get back logits tensor. Assuming the first token
     // is bos so don't callback.
     auto logits_tensor =
-        ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor));
+        ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));

     pos += 1; // start the loop from index 1
     start_pos += 1;
@@ -122,8 +116,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
       // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
       cur_token = prompt_tokens[pos];

-      logits_tensor =
-          ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor));
+      logits_tensor = ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));

       pos++;
       start_pos++;
diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h
index 839ad195c7e..1a05921ed3a 100644
--- a/extension/llm/runner/text_token_generator.h
+++ b/extension/llm/runner/text_token_generator.h
@@ -78,16 +78,13 @@ class ET_EXPERIMENTAL TextTokenGenerator {
     // initialize tensor wrappers
     auto tokens_managed = from_blob(
         token_data.data(), token_shape, executorch::aten::ScalarType::Long);
-    auto start_pos_managed =
-        from_blob(&pos, {1}, executorch::aten::ScalarType::Long);

     should_stop_ = false;

     // Generate our tokens
     while (pos < start_pos + max_new_tokens) {
       // Run the model
-      auto logits_res =
-          text_decoder_runner_->step(tokens_managed, start_pos_managed);
+      auto logits_res = text_decoder_runner_->step(tokens_managed, pos);

       ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error());
       executorch::aten::Tensor& logits_tensor = logits_res.get();
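The arange_util hunks below also fix a format-string mismatch: `SizesType`'s width is build-dependent, so `%d` is not guaranteed to match it. Casting to `int64_t` and printing with `PRId64` is portable; a minimal standalone illustration of the pattern:

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  int32_t narrow = -3;  // stand-in for a SizesType value of unknown width
  auto numel = static_cast<int64_t>(narrow);
  std::printf("numel should be non-negative, but got (%" PRId64 ")\n", numel);
  return 0;
}
```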
diff --git a/kernels/portable/cpu/util/arange_util.cpp b/kernels/portable/cpu/util/arange_util.cpp
index e13f7652736..f9383723af1 100644
--- a/kernels/portable/cpu/util/arange_util.cpp
+++ b/kernels/portable/cpu/util/arange_util.cpp
@@ -12,20 +12,21 @@
 namespace torch::executor::native {

 #define ET_ARANGE_IMPL(ctx, start, numel, step, out, op_name)               \
   ET_SWITCH_REALHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE, [&]() { \
     auto out_data = out.mutable_data_ptr<CTYPE>();                          \
-    for (Tensor::SizesType i = 0; i < numel; ++i) {                         \
+    for (executorch::aten::SizesType i = 0; i < numel; ++i) {               \
       out_data[i] = static_cast<CTYPE>(start + i * step);                   \
     }                                                                       \
   })

-Tensor::SizesType
+executorch::aten::SizesType
 compute_arange_out_size(double start, double end, double step) {
-  Tensor::SizesType numel =
-      static_cast<Tensor::SizesType>(std::ceil((end - start) / step));
+  executorch::aten::SizesType numel =
+      static_cast<executorch::aten::SizesType>(std::ceil((end - start) / step));

   ET_CHECK_MSG(
       numel >= 0,
-      "numel should be non-negative, but got (%d). start (%f), end (%f), step (%f)",
-      numel,
+      "numel should be non-negative, but got (%" PRId64
+      "). start (%f), end (%f), step (%f)",
+      static_cast<int64_t>(numel),
       start,
       end,
       step);
@@ -39,7 +40,7 @@ void arange_out_impl(
     double step,
     Tensor& out) {
   (void)ctx;
-  Tensor::SizesType numel = compute_arange_out_size(start, end, step);
+  executorch::aten::SizesType numel = compute_arange_out_size(start, end, step);
   ET_ARANGE_IMPL(ctx, start, numel, step, out, "arange.start_out");
 }
diff --git a/kernels/portable/cpu/util/arange_util.h b/kernels/portable/cpu/util/arange_util.h
index 5abb52f410c..badaa81867d 100644
--- a/kernels/portable/cpu/util/arange_util.h
+++ b/kernels/portable/cpu/util/arange_util.h
@@ -12,10 +12,10 @@
 namespace torch::executor::native {

-Tensor::SizesType
+executorch::aten::SizesType
 compute_arange_out_size(double start, double end, double step);

-inline Tensor::SizesType compute_arange_out_size(double end) {
+inline executorch::aten::SizesType compute_arange_out_size(double end) {
   return compute_arange_out_size(0.0, end, 1.0);
 }
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index d7ee1ac89ce..7d97e3a05f5 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -295,19 +295,6 @@ def define_common_targets():
         visibility = ["//executorch/kernels/portable/cpu/..."],
     )

-    runtime.cxx_library(
-        name = "arange_util",
-        srcs = ["arange_util.cpp"],
-        exported_headers = ["arange_util.h"],
-        deps = [
-            "//executorch/runtime/kernel:kernel_includes",
-        ],
-        visibility = [
-            "//executorch/kernels/portable/cpu/...",
-            "//executorch/extension/llm/...",
-        ],
-    )
-
     runtime.cxx_library(
         name = "broadcast_indexes_range",
         exported_headers = ["broadcast_indexes_range.h"],
@@ -343,3 +330,17 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
     )
+
+    runtime.cxx_library(
+        name = "arange_util{}".format(suffix),
+        srcs = ["arange_util.cpp"],
+        exported_headers = ["arange_util.h"],
+        exported_deps = [
+            "//executorch/runtime/kernel:kernel_includes{}".format(suffix),
+        ],
+        visibility = [
+            "//executorch/kernels/portable/cpu/...",
+            "//executorch/extension/llm/...",
+        ],
+    )
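The three test models exported below each exercise one branch of the new `step()`: no second input, a numel-1 input_pos, and a numel-3 cache_positions. A hedged sketch of how one might confirm that from the .pte files themselves (assumes the same env vars as the TARGETS entry above; probing an unloaded module via method_meta is assumed to load it lazily):

```cpp
#include <executorch/extension/module/module.h>
#include <cstdio>
#include <cstdlib>

int main() {
  for (const char* var :
       {"NO_KVCACHE", "KVCACHE_INPUT_POS", "KVCACHE_CACHE_POS"}) {
    const char* path = std::getenv(var);
    if (!path) {
      continue;  // env var not set; nothing to probe
    }
    executorch::extension::Module module(path);
    auto meta = module.method_meta("forward");
    if (meta.ok()) {
      std::printf("%s: forward() has %zu input(s)\n", var, meta->num_inputs());
    }
  }
  return 0;
}
```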
diff --git a/test/models/export_program.py b/test/models/export_program.py
index e13b63eaf74..dac42ecee1c 100644
--- a/test/models/export_program.py
+++ b/test/models/export_program.py
@@ -213,6 +213,55 @@ def export_state_names():
         return True


+# Mimicking an LLM whose forward() takes tokens and input_pos.
+class ModuleKVCacheInputPos(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(3, 3)
+
+    def forward(self, x, input_pos):
+        return (self.linear(x.to(torch.float)).to(torch.long) + input_pos).to(
+            torch.float
+        )
+
+    def get_random_inputs(self):
+        return (
+            torch.randint(100, [1, 3], dtype=torch.long),
+            torch.tensor([0], dtype=torch.long),
+        )
+
+
+# Mimicking an LLM whose forward() takes tokens and cache_positions.
+class ModuleKVCacheCachePos(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(3, 3)
+
+    def forward(self, x, cache_positions):
+        return (self.linear(x.to(torch.float)).to(torch.long) + cache_positions).to(
+            torch.float
+        )
+
+    def get_random_inputs(self):
+        return (
+            torch.randint(100, [1, 3], dtype=torch.long),
+            torch.arange(3, dtype=torch.long),
+        )
+
+
+# Mimicking an LLM whose forward() takes only tokens.
+class ModuleNoKVCache(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(3, 3)
+
+    def forward(self, x):
+        return self.linear(x.to(torch.float))
+
+    def get_random_inputs(self):
+        return (torch.randint(100, [1, 3], dtype=torch.long),)
+
+
 #
 # Main logic.
 #
diff --git a/test/models/targets.bzl b/test/models/targets.bzl
index 391ce230ab8..769fcb65ccd 100644
--- a/test/models/targets.bzl
+++ b/test/models/targets.bzl
@@ -63,7 +63,10 @@ def define_common_targets():
         "ModuleAddHalf",
         "ModuleAddMul",
         "ModuleBasic",
+        "ModuleKVCacheCachePos",
+        "ModuleKVCacheInputPos",
         "ModuleMultipleEntry",
+        "ModuleNoKVCache",
         "ModuleIndex",
         "ModuleDynamicCatUnallocatedIO",
         "ModuleSimpleTrain",