diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index af19028f3..f91be9a89 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -42,6 +42,10 @@ git clone git@github.com:pytorch/torchcodec.git
 # Or, using https instead of ssh: git clone https://github.com/pytorch/torchcodec.git
 cd torchcodec
 
+# Optional, but recommended: define a persistent build directory, which speeds
+# up subsequent builds.
+export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"
+
 pip install -e ".[dev]" --no-build-isolation -vv
 # Or, for cuda support: ENABLE_CUDA=1 pip install -e ".[dev]" --no-build-isolation -vv
 ```
diff --git a/setup.py b/setup.py
index 5074ef478..2efccf982 100644
--- a/setup.py
+++ b/setup.py
@@ -126,12 +126,19 @@ def _build_all_extensions_with_cmake(self):
             f"-DTORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR={torchcodec_disable_compile_warning_as_error}",
         ]
 
+        # TORCHCODEC_CMAKE_BUILD_DIR lets developers reuse a persistent build
+        # directory across builds. An unset or empty value keeps the default.
+        self.build_temp = os.getenv("TORCHCODEC_CMAKE_BUILD_DIR") or self.build_temp
+        print(f"Using {self.build_temp = }", flush=True)
         Path(self.build_temp).mkdir(parents=True, exist_ok=True)
 
+        print("Calling cmake (configure)", flush=True)
         subprocess.check_call(
             ["cmake", str(_ROOT_DIR)] + cmake_args, cwd=self.build_temp
         )
+        print("Calling cmake --build", flush=True)
         subprocess.check_call(["cmake", "--build", "."], cwd=self.build_temp)
+        print("Calling cmake --install", flush=True)
         subprocess.check_call(["cmake", "--install", "."], cwd=self.build_temp)
 
     def copy_extensions_to_source(self):
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 1da00484f..857617a51 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -227,14 +227,44 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   NppiSize oSizeROI = {width, height};
   Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};
 
+  // BT.709 full-range YUV -> RGB conversion matrix, taken from
+  // https://mymusing.co/bt-709-yuv-to-rgb-conversion-color/
+  // The -128 offset is needed to first center the U and V channels around 0.
+  static const Npp32f bt709ColorTwist[3][4] = {
+      {1.0f, 0.0f, 1.5748f, 0.0f},
+      {1.0f, -0.187324f, -0.468124f, -128.0f},
+      {1.0f, 1.8556f, 0.0f, -128.0f}};
+
   NppStatus status;
   if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
-    status = nppiNV12ToRGB_709CSC_8u_P2C3R(
-        input,
-        avFrame->linesize[0],
-        static_cast<Npp8u*>(dst.data_ptr()),
-        dst.stride(0),
-        oSizeROI);
+    if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
+      // BT.709 full range (AVCOL_RANGE_JPEG): convert with the custom
+      // ColorTwist matrix so that the output matches libswscale.
+      // The _Ctx variant requires an explicit NPP stream context.
+      NppStreamContext nppStreamCtx;
+      nppGetStreamContext(&nppStreamCtx);
+
+      // The ColorTwist function takes one step (line size in bytes) per
+      // input plane: [0] is the Y plane, [1] the interleaved UV plane.
+      int srcStep[2] = {avFrame->linesize[0], avFrame->linesize[1]};
+
+      status = nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx(
+          input,
+          srcStep,
+          static_cast<Npp8u*>(dst.data_ptr()),
+          dst.stride(0),
+          oSizeROI,
+          bt709ColorTwist,
+          nppStreamCtx);
+    } else {
+      // Any other color range is converted as BT.709 studio range.
+      status = nppiNV12ToRGB_709CSC_8u_P2C3R(
+          input,
+          avFrame->linesize[0],
+          static_cast<Npp8u*>(dst.data_ptr()),
+          dst.stride(0),
+          oSizeROI);
+    }
   } else {
     status = nppiNV12ToRGB_8u_P2C3R(
         input,
diff --git a/test/resources/full_range_709.mp4 b/test/resources/full_range_709.mp4
new file mode 100644
index 000000000..004028190
Binary files /dev/null and b/test/resources/full_range_709.mp4 differ
diff --git a/test/test_decoders.py b/test/test_decoders.py
index dcf9a1585..b95a911bd 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -25,6 +25,7 @@
     assert_frames_equal,
     AV1_VIDEO,
     cpu_and_cuda,
+    FULL_COLOR_RANGE,
     get_ffmpeg_major_version,
     H265_VIDEO,
     in_fbcode,
@@ -32,6 +33,7 @@
     NASA_AUDIO_MP3,
     NASA_AUDIO_MP3_44100,
     NASA_VIDEO,
+    needs_cuda,
     SINE_MONO_S16,
     SINE_MONO_S32,
     SINE_MONO_S32_44100,
@@ -1138,6 +1140,17 @@ def test_pts_to_dts_fallback(self, seek_mode):
         with pytest.raises(AssertionError, match="not equal"):
             torch.testing.assert_close(decoder[0], decoder[10])
 
+    @needs_cuda
+    def test_full_range_bt709_video(self):
+        # CUDA-decoded frames of a BT.709 full-range video must match the CPU
+        # decoder's output, up to an absolute difference of 2 per value.
+        decoder_gpu = VideoDecoder(FULL_COLOR_RANGE.path, device="cuda")
+        decoder_cpu = VideoDecoder(FULL_COLOR_RANGE.path, device="cpu")
+        for frame_index in (0, 10, 20, 5):
+            gpu_frame = decoder_gpu.get_frame_at(frame_index).data.cpu()
+            cpu_frame = decoder_cpu.get_frame_at(frame_index).data
+            torch.testing.assert_close(gpu_frame, cpu_frame, rtol=0, atol=2)
+
 
 class TestAudioDecoder:
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))
diff --git a/test/utils.py b/test/utils.py
index c83a0f241..c7160d594 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -609,3 +609,14 @@ def sample_format(self) -> str:
         },
     },
 )
+
+# BT.709 full color range video (1280x720), decoded on both CUDA and CPU by
+# test_full_range_bt709_video to check that the two outputs agree.
+FULL_COLOR_RANGE = TestVideo(
+    filename="full_range_709.mp4",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)