Add stream_index seek mode, read frame index and update metadata #764
Merged · 26 commits · Jul 21, 2025
Commits (26):
88cacb9 Add stream_index seek mode, read frame index and update metadata (Jul 10, 2025)
4dfc581 add get_frame tests for new seek mode (Jul 10, 2025)
ed3fdec lints (Jul 10, 2025)
6030f9e replace union syntax with optional (Jul 10, 2025)
1361c0d remove color_conversion changes (Jul 10, 2025)
5bb23c2 rename is_key_frame, remove readFrameIndex_, check tensor lengths are… (Jul 10, 2025)
6573943 rename frame_index to custom_frame_mappings (Jul 10, 2025)
3341dd8 lints (Jul 10, 2025)
e914160 Use CustomFrameMappings struct (Jul 10, 2025)
394ac70 use seek_mode keyword in test_ops (Jul 10, 2025)
f13433c Turn custom_frame_mappings_data into dict, use list comprehension (Jul 10, 2025)
8971bd6 Remove c++ struct CustomFrameMappings (Jul 10, 2025)
5d66ef1 underscore custom_frame_mappings_data field (Jul 14, 2025)
4907e6d set keyFrames in readCustomFrameMappingsUpdateMetadataAndIndex (Jul 14, 2025)
d9cec94 Restore parameterized metadata_getter (Jul 14, 2025)
72ae22d Add frameMappings struct to wrap tensor tuple (Jul 14, 2025)
34eb7b4 Extract sorting code to function (Jul 14, 2025)
9a343b3 move FrameMappings struct to singlestreamdecoder.h (Jul 15, 2025)
7728895 Rename variables in generate_custom_frame_mappings (Jul 15, 2025)
dacd826 remove duplicate logic (Jul 15, 2025)
f90718d remove designated initializer (Jul 15, 2025)
a24c79c Check before accessing dict keys (Jul 15, 2025)
ee6e867 add show_entries arg to ffprobe (Jul 15, 2025)
011c8c5 skip test if ffmpeg 4 or 5, remove past debugging attempts (Jul 16, 2025)
c5c5cd2 Use pytest skip in tests (Jul 16, 2025)
4c6aadd Reflect commented suggestions (Jul 16, 2025)
126 changes: 94 additions & 32 deletions src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -198,6 +198,45 @@ int SingleStreamDecoder::getBestStreamIndex(AVMediaType mediaType) {
// VIDEO METADATA QUERY API
// --------------------------------------------------------------------------

void SingleStreamDecoder::sortAllFrames() {
Review comment (Member):
Let's add a small comment to explain what this does.

// Sort the allFrames and keyFrames vecs in each stream, and also set
// additional fields of the FrameInfo entries, like nextPts and frameIndex.
// This is called at the end of a scan, or when setting a user-defined frame
// mapping.
for (auto& [streamIndex, streamInfo] : streamInfos_) {
std::sort(
streamInfo.keyFrames.begin(),
streamInfo.keyFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});
std::sort(
streamInfo.allFrames.begin(),
streamInfo.allFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});

size_t keyFrameIndex = 0;
for (size_t i = 0; i < streamInfo.allFrames.size(); ++i) {
streamInfo.allFrames[i].frameIndex = i;
if (streamInfo.allFrames[i].isKeyFrame) {
TORCH_CHECK(
keyFrameIndex < streamInfo.keyFrames.size(),
"The allFrames vec claims it has MORE keyFrames than the keyFrames vec. There's a bug in torchcodec.");
streamInfo.keyFrames[keyFrameIndex].frameIndex = i;
++keyFrameIndex;
}
if (i + 1 < streamInfo.allFrames.size()) {
streamInfo.allFrames[i].nextPts = streamInfo.allFrames[i + 1].pts;
}
}
TORCH_CHECK(
keyFrameIndex == streamInfo.keyFrames.size(),
"The allFrames vec claims it has LESS keyFrames than the keyFrames vec. There's a bug in torchcodec.");
}
}

void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
if (scannedAllStreams_) {
return;
@@ -283,40 +322,46 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
getFFMPEGErrorStringFromErrorCode(status));

// Sort all frames by their pts.
for (auto& [streamIndex, streamInfo] : streamInfos_) {
std::sort(
streamInfo.keyFrames.begin(),
streamInfo.keyFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});
std::sort(
streamInfo.allFrames.begin(),
streamInfo.allFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});
sortAllFrames();
scannedAllStreams_ = true;
}

size_t keyFrameIndex = 0;
for (size_t i = 0; i < streamInfo.allFrames.size(); ++i) {
streamInfo.allFrames[i].frameIndex = i;
if (streamInfo.allFrames[i].isKeyFrame) {
TORCH_CHECK(
keyFrameIndex < streamInfo.keyFrames.size(),
"The allFrames vec claims it has MORE keyFrames than the keyFrames vec. There's a bug in torchcodec.");
streamInfo.keyFrames[keyFrameIndex].frameIndex = i;
++keyFrameIndex;
}
if (i + 1 < streamInfo.allFrames.size()) {
streamInfo.allFrames[i].nextPts = streamInfo.allFrames[i + 1].pts;
}
void SingleStreamDecoder::readCustomFrameMappingsUpdateMetadataAndIndex(
int streamIndex,
FrameMappings customFrameMappings) {
auto& all_frames = customFrameMappings.all_frames;
auto& is_key_frame = customFrameMappings.is_key_frame;
auto& duration = customFrameMappings.duration;
TORCH_CHECK(
all_frames.size(0) == is_key_frame.size(0) &&
is_key_frame.size(0) == duration.size(0),
"all_frames, is_key_frame, and duration from custom_frame_mappings were not same size.");

auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex];

streamMetadata.beginStreamPtsFromContent = all_frames[0].item<int64_t>();
streamMetadata.endStreamPtsFromContent =
all_frames[-1].item<int64_t>() + duration[-1].item<int64_t>();
Review comment (Contributor):
TIL that tensors on the C++ side also support negative indices! I'm so used to that being not allowed in C++ arrays and standard containers that I initially thought this was undefined behavior!


auto avStream = formatContext_->streams[streamIndex];
streamMetadata.beginStreamPtsSecondsFromContent =
*streamMetadata.beginStreamPtsFromContent * av_q2d(avStream->time_base);

streamMetadata.endStreamPtsSecondsFromContent =
*streamMetadata.endStreamPtsFromContent * av_q2d(avStream->time_base);

Review comment (Contributor):
We should probably use the ptsToSeconds() function here, but it looks like we're also not using it in the scanning function. And we can probably better define ptsToSeconds() to use av_q2d(). Let's address that elsewhere; created #770 to track.

Review comment (Member):
Just bumping this again as it may have been missed.

Review comment (Contributor Author):
I will address #770 after completing this change, so we can update + test all locations in one PR.

streamMetadata.numFramesFromContent = all_frames.size(0);
for (int64_t i = 0; i < all_frames.size(0); ++i) {
FrameInfo frameInfo;
frameInfo.pts = all_frames[i].item<int64_t>();
frameInfo.isKeyFrame = is_key_frame[i].item<bool>();
streamInfos_[streamIndex].allFrames.push_back(frameInfo);
if (frameInfo.isKeyFrame) {
streamInfos_[streamIndex].keyFrames.push_back(frameInfo);
}
TORCH_CHECK(
keyFrameIndex == streamInfo.keyFrames.size(),
"The allFrames vec claims it has LESS keyFrames than the keyFrames vec. There's a bug in torchcodec.");
}

scannedAllStreams_ = true;
// Sort all frames by their pts
sortAllFrames();
}

Review comment (Member):
I realize reading this that there is an additional design decision we'll have to make: whether we expect the index to be already sorted, or not.

In the existing scanFileAndUpdateMetadataAndIndex() function we are reading packets in order, and yet we are sorting them afterwards:

// Sort all frames by their pts.
for (auto& [streamIndex, streamInfo] : streamInfos_) {
std::sort(
streamInfo.keyFrames.begin(),
streamInfo.keyFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});
std::sort(
streamInfo.allFrames.begin(),
streamInfo.allFrames.end(),
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
return frameInfo1.pts < frameInfo2.pts;
});

I suspect that frame mappings coming from ffprobe won't be ordered in general. I think we have the following options:

  • expect the input mapping to be sorted - that may not be a great UX
  • sort the mapping in Python - this is kinda what this PR is doing, by sorting the mappings in the tests - but we should remove that and have that logic within the code, not the tests
  • sort the mappings in C++

I think the simpler, safer choice is to sort in C++ and rely on the same sorting logic that we have in scanFileAndUpdateMetadataAndIndex(). Curious what your thoughts are @Dan-Flores @scotts ?

Review comment (Contributor):
Agreed on sorting on the C++ side; we can do it quickly there, and it's much nicer to users. We should extract the existing logic out to a utility function called only in this cpp file, and call it in both places.

I think we sort in the scan function because it's possible for the actual packets not to be in PTS order.

Review comment (Contributor Author):
Thanks for the feedback! Sorting on the C++ side makes sense; I've updated the PR to reuse the sorting code from the scan function.
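
Since the sorting now happens in C++, a mapping generated straight from ffprobe can be passed unsorted. Below is a minimal sketch of how such a mapping could be built; the helper name is hypothetical (not code from this PR), and on older FFmpeg versions the per-frame duration field is named pkt_duration rather than duration:

import json
import subprocess

import torch

def frame_mappings_from_ffprobe(path: str, stream: str = "v:0"):
    # Ask ffprobe for per-frame pts, duration, and key-frame flags as JSON.
    out = subprocess.run(
        ["ffprobe", "-v", "error", "-select_streams", stream,
         "-show_frames", "-show_entries", "frame=pts,duration,key_frame",
         "-of", "json", path],
        capture_output=True, check=True,
    ).stdout
    frames = json.loads(out)["frames"]
    # No pre-sorting by pts is needed: the decoder calls sortAllFrames() itself.
    return (
        torch.tensor([int(f["pts"]) for f in frames], dtype=torch.int64),
        torch.tensor([bool(int(f["key_frame"])) for f in frames], dtype=torch.bool),
        torch.tensor([int(f["duration"]) for f in frames], dtype=torch.int64),
    )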

ContainerMetadata SingleStreamDecoder::getContainerMetadata() const {
@@ -431,7 +476,8 @@ void SingleStreamDecoder::addStream(

void SingleStreamDecoder::addVideoStream(
int streamIndex,
const VideoStreamOptions& videoStreamOptions) {
const VideoStreamOptions& videoStreamOptions,
std::optional<FrameMappings> customFrameMappings) {
addStream(
streamIndex,
AVMEDIA_TYPE_VIDEO,
@@ -456,6 +502,14 @@
streamMetadata.height = streamInfo.codecContext->height;
streamMetadata.sampleAspectRatio =
streamInfo.codecContext->sample_aspect_ratio;

if (seekMode_ == SeekMode::custom_frame_mappings) {
TORCH_CHECK(
customFrameMappings.has_value(),
"Please provide frame mappings when using custom_frame_mappings seek mode.");
readCustomFrameMappingsUpdateMetadataAndIndex(
streamIndex, customFrameMappings.value());
}
}

void SingleStreamDecoder::addAudioStream(
@@ -1407,6 +1461,7 @@ int SingleStreamDecoder::getKeyFrameIndexForPtsUsingScannedIndex(
int64_t SingleStreamDecoder::secondsToIndexLowerBound(double seconds) {
auto& streamInfo = streamInfos_[activeStreamIndex_];
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact: {
auto frame = std::lower_bound(
streamInfo.allFrames.begin(),
@@ -1434,6 +1489,7 @@
int64_t SingleStreamDecoder::secondsToIndexUpperBound(double seconds) {
auto& streamInfo = streamInfos_[activeStreamIndex_];
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact: {
auto frame = std::upper_bound(
streamInfo.allFrames.begin(),
@@ -1461,6 +1517,7 @@
int64_t SingleStreamDecoder::getPts(int64_t frameIndex) {
auto& streamInfo = streamInfos_[activeStreamIndex_];
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamInfo.allFrames[frameIndex].pts;
case SeekMode::approximate: {
@@ -1485,6 +1542,7 @@ int64_t SingleStreamDecoder::getPts(int64_t frameIndex) {
std::optional<int64_t> SingleStreamDecoder::getNumFrames(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.numFramesFromContent.value();
case SeekMode::approximate: {
@@ -1498,6 +1556,7 @@ std::optional<int64_t> SingleStreamDecoder::getNumFrames(
double SingleStreamDecoder::getMinSeconds(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.beginStreamPtsSecondsFromContent.value();
case SeekMode::approximate:
@@ -1510,6 +1569,7 @@ double SingleStreamDecoder::getMinSeconds(
std::optional<double> SingleStreamDecoder::getMaxSeconds(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.endStreamPtsSecondsFromContent.value();
case SeekMode::approximate: {
@@ -1645,6 +1705,8 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
return SingleStreamDecoder::SeekMode::exact;
} else if (seekMode == "approximate") {
return SingleStreamDecoder::SeekMode::approximate;
} else if (seekMode == "custom_frame_mappings") {
return SingleStreamDecoder::SeekMode::custom_frame_mappings;
} else {
TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
}
29 changes: 27 additions & 2 deletions src/torchcodec/_core/SingleStreamDecoder.h
@@ -29,7 +29,7 @@ class SingleStreamDecoder {
// CONSTRUCTION API
// --------------------------------------------------------------------------

enum class SeekMode { exact, approximate };
enum class SeekMode { exact, approximate, custom_frame_mappings };
Review comment (Member):
I don't know if we'll want to publicly expose a new seek_mode in Python, but I think for the C++ side this is a reasonable approach. @scotts curious if you have any opinion on this?

Review comment (Contributor):
Agreed. Right now, the SingleStreamDecoder's seekMode_ is what we use to determine if we should do a file scan or not:

if (seekMode_ == SeekMode::exact) {
scanFileAndUpdateMetadataAndIndex();
}

Since the point of this feature is to avoid a file scan while still maintaining seek accuracy, I think it makes sense for it to be a new seek mode on the C++ side. We can figure out what to do with the public API later.
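
For illustration, a hedged sketch of what the three modes mean at the core-ops level; the import path is an assumption, and this surface is internal rather than public API:

from torchcodec import _core as core  # assumed internal import path

# "exact": frame accurate, but scans the whole file up front to build the index.
exact = core.create_from_file("video.mp4", seek_mode="exact")

# "approximate": no scan; seeks rely on container-level metadata only.
approx = core.create_from_file("video.mp4", seek_mode="approximate")

# "custom_frame_mappings": no scan, yet still frame accurate, because the
# caller supplies the index later via add_video_stream(...).
custom = core.create_from_file("video.mp4", seek_mode="custom_frame_mappings")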


// Creates a SingleStreamDecoder from the video at videoFilePath.
explicit SingleStreamDecoder(
@@ -53,20 +53,38 @@
// the allFrames and keyFrames vectors.
void scanFileAndUpdateMetadataAndIndex();

// Sorts the keyFrames and allFrames vectors in each StreamInfo by pts.
void sortAllFrames();

Review comment (Member):
Just bumping #764 (comment) again, which may have been missed:

  • we should document the expected length, dtype, and associated semantics of each tensor.
  • I'd also recommend relying on a new FrameMappings struct instead of a 3-tuple.

Review comment (Contributor):
To elaborate on defining a new FrameMappings struct: I think of the code in custom_ops.cpp as the bridge layer between the C++ logic and the Python logic. So that's the layer where we would turn a tuple of tensors into a struct. That way, in the C++ logic, we're (as much as possible) operating on proper types with meaningful field names.

As a point of comparison, we do something similar with batched output. SingleStreamDecoder returns a FrameBatchOutput struct, and the code in custom_ops.cpp turns that into a tuple of tensors.

Review comment (Contributor Author):
Thanks for the reference. I added a FrameMappings struct to Frame.h and used it on the C++ side; please let me know if the design pattern needs any adjustments.
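
To make the documented semantics concrete, here is a tiny hand-written mapping for a hypothetical four-frame stream (illustrative values, in timebase units); the Python side passes a plain 3-tuple, and makeFrameMappings() in custom_ops.cpp converts it into the struct:

import torch

all_frames = torch.tensor([0, 512, 1024, 1536], dtype=torch.int64)  # pts per frame
is_key_frame = torch.tensor([True, False, False, True])             # dtype inferred as torch.bool
duration = torch.tensor([512, 512, 512, 512], dtype=torch.int64)    # per-frame duration

# All three tensors must have the same length (this is checked in
# readCustomFrameMappingsUpdateMetadataAndIndex).
custom_frame_mappings = (all_frames, is_key_frame, duration)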

// Returns the metadata for the container.
ContainerMetadata getContainerMetadata() const;

// Returns the key frame indices as a tensor. The tensor is 1D and contains
// int64 values, where each value is the frame index for a key frame.
torch::Tensor getKeyFrameIndices();

// FrameMappings is used for the custom_frame_mappings seek mode to store
// metadata of frames in a stream. The size of all tensors in this struct must
// match.

// --------------------------------------------------------------------------
// ADDING STREAMS API
// --------------------------------------------------------------------------
struct FrameMappings {
// 1D tensor of int64, each value is the PTS of a frame in timebase units.
torch::Tensor all_frames;
// 1D tensor of bool, each value indicates if the corresponding frame in
// all_frames is a key frame.
torch::Tensor is_key_frame;
// 1D tensor of int64, each value is the duration of the corresponding frame
// in all_frames in timebase units.
torch::Tensor duration;
};

void addVideoStream(
int streamIndex,
const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
const VideoStreamOptions& videoStreamOptions = VideoStreamOptions(),
std::optional<FrameMappings> customFrameMappings = std::nullopt);
void addAudioStream(
int streamIndex,
const AudioStreamOptions& audioStreamOptions = AudioStreamOptions());
@@ -226,6 +244,13 @@
// --------------------------------------------------------------------------

void initializeDecoder();

// Reads the user-provided frame index and updates each StreamInfo's index,
// i.e. the allFrames and keyFrames vectors, and
// endStreamPtsSecondsFromContent.
void readCustomFrameMappingsUpdateMetadataAndIndex(
int streamIndex,
FrameMappings customFrameMappings);
// --------------------------------------------------------------------------
// DECODING APIS AND RELATED UTILS
// --------------------------------------------------------------------------
29 changes: 23 additions & 6 deletions src/torchcodec/_core/custom_ops.cpp
@@ -36,9 +36,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
m.def(
"_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, str? color_conversion_library=None) -> ()");
"_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
m.def(
"add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()");
"add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
m.def(
"add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -105,6 +105,14 @@ OpsFrameOutput makeOpsFrameOutput(FrameOutput& frame) {
torch::tensor(frame.durationSeconds, torch::dtype(torch::kFloat64)));
}

SingleStreamDecoder::FrameMappings makeFrameMappings(
std::tuple<at::Tensor, at::Tensor, at::Tensor> custom_frame_mappings) {
return SingleStreamDecoder::FrameMappings{
std::get<0>(custom_frame_mappings),
std::get<1>(custom_frame_mappings),
std::get<2>(custom_frame_mappings)};
}

// All elements of this tuple are tensors of the same leading dimension. The
// tuple represents the frames for N total frames, where N is the dimension of
// each stacked tensor. The elements are:
@@ -223,6 +231,8 @@ void _add_video_stream(
std::optional<std::string_view> dimension_order = std::nullopt,
std::optional<int64_t> stream_index = std::nullopt,
std::optional<std::string_view> device = std::nullopt,
std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
custom_frame_mappings = std::nullopt,
std::optional<std::string_view> color_conversion_library = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.width = width;
@@ -253,9 +263,13 @@ void _add_video_stream(
if (device.has_value()) {
videoStreamOptions.device = createTorchDevice(std::string(device.value()));
}

std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
custom_frame_mappings.has_value()
? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
: std::nullopt;
auto videoDecoder = unwrapTensorToGetDecoder(decoder);
videoDecoder->addVideoStream(stream_index.value_or(-1), videoStreamOptions);
videoDecoder->addVideoStream(
stream_index.value_or(-1), videoStreamOptions, converted_mappings);
}

// Add a new video stream at `stream_index` using the provided options.
@@ -266,15 +280,18 @@ void add_video_stream(
std::optional<int64_t> num_threads = std::nullopt,
std::optional<std::string_view> dimension_order = std::nullopt,
std::optional<int64_t> stream_index = std::nullopt,
std::optional<std::string_view> device = std::nullopt) {
std::optional<std::string_view> device = std::nullopt,
std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
custom_frame_mappings = std::nullopt) {
_add_video_stream(
decoder,
width,
height,
num_threads,
dimension_order,
stream_index,
device);
device,
custom_frame_mappings);
}

void add_audio_stream(
6 changes: 6 additions & 0 deletions src/torchcodec/_core/ops.py
@@ -205,6 +205,9 @@ def _add_video_stream_abstract(
dimension_order: Optional[str] = None,
stream_index: Optional[int] = None,
device: Optional[str] = None,
custom_frame_mappings: Optional[
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
] = None,
color_conversion_library: Optional[str] = None,
) -> None:
return
@@ -220,6 +223,9 @@ def add_video_stream_abstract(
dimension_order: Optional[str] = None,
stream_index: Optional[int] = None,
device: Optional[str] = None,
custom_frame_mappings: Optional[
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
] = None,
) -> None:
return
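
Putting the pieces together, a hedged end-to-end sketch of the new surface; op names and keywords follow the schemas in this diff, the import path and the get_frame_at_index call are assumptions, and frame_mappings_from_ffprobe is the hypothetical helper sketched earlier:

from torchcodec import _core as core  # assumed internal import path

mappings = frame_mappings_from_ffprobe("video.mp4")  # (all_frames, is_key_frame, duration)

decoder = core.create_from_file("video.mp4", seek_mode="custom_frame_mappings")
core.add_video_stream(decoder, stream_index=0, custom_frame_mappings=mappings)

# Frame-accurate access without a prior file scan.
frame, pts_seconds, duration_seconds = core.get_frame_at_index(decoder, frame_index=10)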
