class TransformersWrapper(LLMWrapperBase):
    """A wrapper class for Hugging Face Transformers models, providing a consistent interface for text generation and log probability computation.

-    This class is a subclass of :class:`~torchrl.modules.llm.policies.LLMWrapperBase` and provides a unified API for handling different input modalities
-    (history, text, tokens) with consistent output structure using :class:`~tensordict.TensorClass` objects.
+    Packing vs Padding:
+    - Packing (`pad_model_input=False`):
+        * More memory efficient for variable-length sequences.
+        * Not all models support packed input (requires custom attention masks and position ids).
+        * May be less compatible with some Hugging Face models or custom architectures.
+    - Padding (`pad_model_input=True`):
+        * Universally supported by all models.
+        * Wastes memory for short sequences in a batch.
+        * Simpler, but less efficient for highly variable-length data.
+    - If unsure, use padding for maximum compatibility; use packing for large batches of variable-length data when your model supports it.
+
+    Additional error handling is provided for empty and overlong sequences.

    Args:
        model (transformers.AutoModelForCausalLM | str): The Hugging Face Transformers model to wrap.
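As a rough usage sketch of the two modes described above: the snippet below assumes `TransformersWrapper` is importable from `torchrl.modules.llm` and accepts a `tokenizer` keyword alongside `pad_model_input`; both assumptions go beyond what this excerpt shows.

# Minimal sketch: constructor arguments other than `model` and `pad_model_input`
# are assumptions, not taken from this excerpt.
from transformers import AutoModelForCausalLM, AutoTokenizer
from torchrl.modules.llm import TransformersWrapper  # import path assumed

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Padding: universally supported, but every sequence is padded to the batch max.
wrapper_padded = TransformersWrapper(model, tokenizer=tokenizer, pad_model_input=True)

# Packing: more memory efficient for variable-length batches, provided the model
# accepts custom attention masks and position ids.
wrapper_packed = TransformersWrapper(model, tokenizer=tokenizer, pad_model_input=False)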
@@ -2038,9 +2048,26 @@ def _pack_sequences(
        )

    def _model_forward_with_padded_sequences(
-        self, tokens_full_padded, attention_mask_full_padded, pad_val, logits_only=False, **kwargs
+        self,
+        tokens_full_padded: torch.Tensor,
+        attention_mask_full_padded: torch.Tensor,
+        *,
+        pad_val: float | int | torch.Tensor | None = None,
+        logits_only: bool = False,
+        **kwargs,
    ):
        """Forward pass with padded sequences."""
+        # Error handling for empty sequences
+        if tokens_full_padded.numel() == 0:
+            raise ValueError(
+                "Input contains empty sequences. Packing/padding requires at least one token per sequence."
+            )
+        # Error handling for overlong sequences
+        max_len = getattr(self.model.config, "max_position_embeddings", None)
+        if max_len is not None and tokens_full_padded.shape[-1] > max_len:
+            raise ValueError(
+                f"Input sequence length ({tokens_full_padded.shape[-1]}) exceeds model's max_position_embeddings ({max_len}). Consider truncating or splitting your input."
+            )
        tokens_out_struct = self.model(
            tokens_full_padded, attention_mask_full_padded, **kwargs
        )
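For reference, a self-contained sketch of the padded path that this method implements: right-pad variable-length sequences, build an attention mask, apply the same empty/overlong guards, and call a Hugging Face causal LM. The model choice and pad value here are illustrative, not the wrapper's own defaults.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
pad_val = tokenizer.eos_token_id  # gpt2 has no pad token; reuse EOS for this toy example

# Two variable-length sequences, right-padded to the batch max length.
seqs = [torch.tensor([10, 11, 12]), torch.tensor([20, 21])]
tokens_full_padded = torch.nn.utils.rnn.pad_sequence(
    seqs, batch_first=True, padding_value=pad_val
)
attention_mask_full_padded = (tokens_full_padded != pad_val).long()

# Same guards as above: reject empty input and overlong sequences.
assert tokens_full_padded.numel() > 0
max_len = getattr(model.config, "max_position_embeddings", None)
assert max_len is None or tokens_full_padded.shape[-1] <= max_len

with torch.no_grad():
    out = model(tokens_full_padded, attention_mask=attention_mask_full_padded)
logits_full_padded = out.logits  # (batch_size, padded_len, vocab_size)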
@@ -2057,35 +2084,51 @@ def _model_forward_with_padded_sequences(
        return log_probs_full_padded, logits_full_padded

    def _model_forward_with_packed_sequences(
-        self, flat_input_ids, block_diag_attention_mask, pad: bool = True, logits_only=False, **kwargs
+        self,
+        flat_input_ids: torch.Tensor,
+        block_diag_attention_mask: torch.Tensor,
+        *,
+        pad: bool = True,
+        logits_only: bool = False,
+        **kwargs,
    ):
        """Pack sequences into a single tensor and forward them through the model.

        Args:
-            input_ids: NestedTensor of shape (batch_size, -1)
-            attention_mask: NestedTensor of shape (batch_size, -1)
+            flat_input_ids (NestedTensor): NestedTensor of shape (batch_size, -1)
+            block_diag_attention_mask (NestedTensor): NestedTensor of shape (batch_size, -1)
+            pad (bool): Whether to pad the output tensors.
+            logits_only (bool): Whether to return only logits.
+            kwargs (dict): Additional keyword arguments to pass to the model.

        Returns:
-            logits: NestedTensor of shape (batch_size, -1, vocab_size)

        """
+        # Error handling for empty sequences
+        if flat_input_ids.numel() == 0:
+            raise ValueError(
+                "Input contains empty sequences. Packing requires at least one token per sequence."
+            )
+        # Error handling for overlong sequences.
+        # Note: the check is skipped for nested tensors due to their symbolic shape
+        # representation; the model handles sequence length limits internally in that case.
+        max_len = getattr(self.model.config, "max_position_embeddings", None)
+        if max_len is not None and not flat_input_ids.is_nested:
+            # Only check regular tensors, not nested tensors
+            actual_size = flat_input_ids.shape[-1]
+            if actual_size > max_len:
+                raise ValueError(
+                    f"Input sequence length ({actual_size}) exceeds model's max_position_embeddings ({max_len}). Consider truncating or splitting your input."
+                )
        (
            flat_input_ids,
            block_diag_attention_mask,
            packing_metadata,
        ) = self._pack_sequences(flat_input_ids, block_diag_attention_mask)
-        # check shapes: [B, L] for input_ids, [B, L, L] for attention_mask
-        if flat_input_ids.shape != block_diag_attention_mask.shape[:2]:
-            raise ValueError(
-                f"Input ids shape {flat_input_ids.shape=} does not match attention mask shape {block_diag_attention_mask.shape[:2]=}"
-            )
-        if flat_input_ids.shape[1] != block_diag_attention_mask.shape[2]:
-            raise ValueError(
-                f"Input ids shape {flat_input_ids.shape[1]=} does not match attention mask shape {block_diag_attention_mask.shape[2]=}"
-            )
+
        outputs = self.model(
            input_ids=flat_input_ids,
-            attention_mask=block_diag_attention_mask,
+            attention_mask=block_diag_attention_mask.unsqueeze(0),
            position_ids=packing_metadata["position_ids"],
            use_cache=False,  # Disable KV cache for packing
            **kwargs,
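A small standalone sketch of what the packed call above receives: the sequences concatenated into a single row, plus a block-diagonal causal mask that gains a batch dimension via `unsqueeze(0)`. Here the mask is built with `torch.block_diag` purely for illustration; the wrapper builds it through `_pack_sequences` and `repeat_interleave_causal`.

import torch

# Two sequences of lengths 3 and 2 packed back to back into one row of length 5.
seqs = [torch.tensor([10, 11, 12]), torch.tensor([20, 21])]
lengths = [s.numel() for s in seqs]
flat_input_ids = torch.cat(seqs).unsqueeze(0)            # shape (1, 5)

# Block-diagonal causal mask: each sequence gets its own lower-triangular block,
# so tokens never attend across sequence boundaries.
blocks = [torch.tril(torch.ones(n, n, dtype=torch.long)) for n in lengths]
block_diag_attention_mask = torch.block_diag(*blocks)    # shape (5, 5)

# The forward call above adds the batch dimension before handing it to the model.
attention_mask = block_diag_attention_mask.unsqueeze(0)  # shape (1, 5, 5)
print(flat_input_ids.shape, attention_mask.shape)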
@@ -2113,30 +2156,34 @@ def _unpack_outputs(
            logits_only=logits_only,
        )
        # check shapes: [1, L] for log_probs, [1, L, vocab_size] for logits
-        if log_probs.shape != logits.shape[:2]:
-            raise ValueError(
-                f"Log probs shape {log_probs.shape=} does not match logits shape {logits.shape[:2]=}"
-            )
-        if log_probs.ndim != 2:
-            raise ValueError(f"Log probs shape {log_probs.shape=} is not 2D")
-        if logits.ndim != 3:
-            raise ValueError(f"Logits shape {logits.shape=} is not 3D")
-        sequence_lengths = packing_metadata["sequence_lengths"]
-        if log_probs.shape[1] != sequence_lengths.sum():
-            raise ValueError(
-                f"Log probs shape {log_probs.shape=} does not match sequence lengths {sequence_lengths.sum()=}"
+        sequence_lengths = packing_metadata["sequence_lengths"]
+        if logits_only:
+            log_probs = None
+        else:
+            if log_probs.shape != logits.shape[:2]:
+                raise ValueError(
+                    f"Log probs shape {log_probs.shape=} does not match logits shape {logits.shape[:2]=}"
+                )
+            if log_probs.ndim != 2:
+                raise ValueError(f"Log probs shape {log_probs.shape=} is not 2D")
+            if logits.ndim != 3:
+                raise ValueError(f"Logits shape {logits.shape=} is not 3D")
+            if log_probs.shape[1] != sequence_lengths.sum():
+                raise ValueError(
+                    f"Log probs shape {log_probs.shape=} does not match sequence lengths {sequence_lengths.sum()=}"
+                )
+
+            log_probs = log_probs.squeeze(0)
+            nested_logprobs = torch.nested.nested_tensor_from_jagged(
+                log_probs,
+                lengths=sequence_lengths,
            )

        logits = logits.squeeze(0)
        nested_logits = torch.nested.nested_tensor_from_jagged(
            logits,  # Remove batch dim: (total_length, vocab_size)
            lengths=sequence_lengths,
        )
-        log_probs = log_probs.squeeze(0)
-        nested_logprobs = torch.nested.nested_tensor_from_jagged(
-            log_probs,
-            lengths=sequence_lengths,
-        )

        if pad:
            return nested_logprobs.to_padded_tensor(
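To make the un-packing step above concrete, a minimal sketch of the `nested_tensor_from_jagged` / `to_padded_tensor` round trip on stand-in values (random tensors in place of the model's packed log-probs and logits); it assumes a PyTorch version that ships the jagged nested-tensor API used above.

import torch

sequence_lengths = torch.tensor([2, 3])
total_length = int(sequence_lengths.sum())
vocab_size = 7

# Stand-ins for the packed outputs after the batch dim has been squeezed away:
# per-token log-probs of shape (total_length,), logits of shape (total_length, vocab_size).
log_probs = torch.randn(total_length)
logits = torch.randn(total_length, vocab_size)

# Re-split the flat dimension into one ragged component per sequence.
nested_logprobs = torch.nested.nested_tensor_from_jagged(log_probs, lengths=sequence_lengths)
nested_logits = torch.nested.nested_tensor_from_jagged(logits, lengths=sequence_lengths)

# Optionally pad back to rectangular tensors, as the `pad=True` branch does.
padded_logprobs = nested_logprobs.to_padded_tensor(0.0)  # (2, 3)
padded_logits = nested_logits.to_padded_tensor(0.0)      # (2, 3, vocab_size)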
@@ -2173,7 +2220,7 @@ def repeat_interleave_causal(self, sequence_lengths: torch.Tensor) -> torch.Tensor:
        seq_ids = torch.arange(len(sequence_lengths), device=sequence_lengths.device)
        position_to_seq_id = seq_ids.repeat_interleave(sequence_lengths)

-        positions = torch.arange(total_length, device=sequence_lengths.device)
+        positions = torch.arange(int(total_length), device=sequence_lengths.device)

        same_sequence = position_to_seq_id.unsqueeze(1) == position_to_seq_id.unsqueeze(
            0
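A worked example of the mask logic in `repeat_interleave_causal` for `sequence_lengths = [2, 3]`. The final combination of the same-sequence test with a causal (lower-triangular) constraint is not visible in this hunk, so the last two lines are an assumption about how the function completes.

import torch

sequence_lengths = torch.tensor([2, 3])
total_length = int(sequence_lengths.sum())

# Which packed position belongs to which sequence: [0, 0, 1, 1, 1]
seq_ids = torch.arange(len(sequence_lengths))
position_to_seq_id = seq_ids.repeat_interleave(sequence_lengths)

positions = torch.arange(total_length)
same_sequence = position_to_seq_id.unsqueeze(1) == position_to_seq_id.unsqueeze(0)

# Assumed completion: restrict attention to earlier positions of the same sequence.
causal = positions.unsqueeze(1) >= positions.unsqueeze(0)
mask = same_sequence & causal
# mask.int():
# [[1, 0, 0, 0, 0],
#  [1, 1, 0, 0, 0],
#  [0, 0, 1, 0, 0],
#  [0, 0, 1, 1, 0],
#  [0, 0, 1, 1, 1]]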
@@ -2193,7 +2240,7 @@ def _create_packed_position_ids(
        No cuda syncs.
        """
        if total_length is None:
-            total_length = sequence_lengths.sum()
+            total_length = int(sequence_lengths.sum().item())

        # Create global position IDs: [0, 1, 2, 3, 4]
        global_positions = torch.arange(total_length, device=sequence_lengths.device)
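And a worked example of the packed position ids for `sequence_lengths = [2, 3]`: global positions [0, 1, 2, 3, 4] are shifted back to per-sequence positions [0, 1, 0, 1, 2] by subtracting each sequence's start offset. The subtraction step lies beyond this hunk, so it is sketched here as an assumption about how the helper likely proceeds.

import torch

sequence_lengths = torch.tensor([2, 3])
total_length = int(sequence_lengths.sum().item())

# Create global position IDs over the packed row: [0, 1, 2, 3, 4]
global_positions = torch.arange(total_length, device=sequence_lengths.device)

# Assumed completion: start offset of each sequence, repeated per token: [0, 0, 2, 2, 2]
starts = torch.cat([torch.zeros(1, dtype=torch.long), sequence_lengths.cumsum(0)[:-1]])
start_per_token = starts.repeat_interleave(sequence_lengths)

# Per-sequence position ids: [0, 1, 0, 1, 2]
position_ids = global_positions - start_per_token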