# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Any, Optional, cast
import torch
from datasets import Dataset
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from nemo_rl.data.interfaces import (
FlatMessagesType,
LLMMessageLogType,
TaskDataSpec,
)
from nemo_rl.distributed.batched_data_dict import BatchedDataDict
Tensor = torch.Tensor
TokenizerType = PreTrainedTokenizerBase
def message_log_to_flat_messages(
message_log: LLMMessageLogType,
) -> FlatMessagesType:
"""Converts a message log (sequence of message turns) into a flattened representation.
This function takes a message log (list of dict messages with 'role', 'content', 'token_ids', etc.)
and converts it to a flat dictionary where all tensors of the same key are concatenated and
all strings of the same key are put into lists.
Args:
message_log: List of message dictionaries with 'role', 'content', and potentially 'token_ids'
Returns:
FlatMessagesType: Dictionary mapping keys to concatenated tensors and string lists
Examples:
```{doctest}
>>> import torch
>>> from nemo_rl.data.llm_message_utils import message_log_to_flat_messages
>>> # Create a simple message log with two messages
>>> message_log = [
... {'role': 'user', 'content': 'Hello', 'token_ids': torch.tensor([1, 2, 3])},
... {'role': 'assistant', 'content': 'Hi there', 'token_ids': torch.tensor([4, 5, 6, 7])}
... ]
>>> flat_msgs = message_log_to_flat_messages(message_log)
>>> flat_msgs['role']
['user', 'assistant']
>>> flat_msgs['content']
['Hello', 'Hi there']
>>> flat_msgs['token_ids']
tensor([1, 2, 3, 4, 5, 6, 7])
```
"""
result: dict[str, list[Any]] = {}
if len(message_log) == 0:
return cast(FlatMessagesType, result)
# Get all unique keys across all messages
all_keys: set[str] = set()
for msg in message_log:
all_keys.update(msg.keys())
# Initialize result with empty lists for each key
for key in all_keys:
result[key] = []
# Collect values for each key
for msg in message_log:
for key in all_keys:
if key in msg:
result[key].append(msg[key])
# Concatenate tensors for each key
concat: FlatMessagesType = {}
for key in result:
if result[key] and isinstance(result[key][0], Tensor):
try:
concat[key] = torch.cat(result[key])
except RuntimeError as e:
if "same number of dimensions" in str(e):
raise RuntimeError(
f"tensors for {key=} must have same number of dimensions: {[t.shape for t in result[key]]}"
) from e
raise
output: FlatMessagesType = {**result, **concat}
return output
def get_keys_from_message_log(
message_log: LLMMessageLogType, keys: list[str]
) -> LLMMessageLogType:
"""Return a new LLMMessageLogType containing only the specified keys from each message.
Args:
message_log: Original message log to extract keys from
keys: List of keys to keep in each message
Returns:
LLMMessageLogType: New list with only specified keys
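Examples:
Illustrative doctest with a toy two-turn message log (keys not listed, such as 'extra' below, are dropped):
```{doctest}
>>> from nemo_rl.data.llm_message_utils import get_keys_from_message_log
>>> message_log = [
... {'role': 'user', 'content': 'Hello', 'extra': 1},
... {'role': 'assistant', 'content': 'Hi there', 'extra': 2}
... ]
>>> get_keys_from_message_log(message_log, ['role', 'content'])
[{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi there'}]
```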
"""
return [{k: msg[k] for k in keys if k in msg} for msg in message_log]
def add_loss_mask_to_message_log(
batch_message_log: list[LLMMessageLogType],
roles_to_train_on: list[str] = ["assistant"],
only_unmask_final: bool = False,
) -> None:
"""Add token-level loss masks to each message in a message log.
Args:
message_log (LLMMessageLogType): List of message dictionaries containing token IDs and metadata
roles_to_train_on (list[str]): List of strings indicating which speakers to unmask. Default: ["assistant"]
only_unmask_final (bool): If True, only unmask the final message in the log. Default: False
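Examples:
Illustrative doctest with toy token IDs; the function modifies the batch in place and returns None:
```{doctest}
>>> import torch
>>> from nemo_rl.data.llm_message_utils import add_loss_mask_to_message_log
>>> batch = [[
... {'role': 'user', 'token_ids': torch.tensor([1, 2, 3])},
... {'role': 'assistant', 'token_ids': torch.tensor([4, 5])}
... ]]
>>> add_loss_mask_to_message_log(batch)
>>> batch[0][0]['token_loss_mask']
tensor([0, 0, 0])
>>> batch[0][1]['token_loss_mask']
tensor([1, 1])
```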
"""
# Normalize roles without mutating the caller's list (or the shared default argument)
roles_to_train_on = [role.lower() for role in roles_to_train_on]
for message_log in batch_message_log:
for i, message in enumerate(message_log):
if only_unmask_final:
if i == len(message_log) - 1:
message["token_loss_mask"] = torch.ones_like(
cast(Tensor, message["token_ids"])
)
else:
message["token_loss_mask"] = torch.zeros_like(
cast(Tensor, message["token_ids"])
)
else:
if message["role"] in roles_to_train_on:
message["token_loss_mask"] = torch.ones_like(
cast(Tensor, message["token_ids"])
)
else:
message["token_loss_mask"] = torch.zeros_like(
cast(Tensor, message["token_ids"])
)
def _pad_tensor(
tensor: Tensor,
max_len: int,
pad_side: str,
pad_value: int = 0,
) -> Tensor:
"""Pad a tensor to the specified length.
Args:
tensor: Tensor to pad
max_len: Length to pad to
pad_side: Whether to pad on the 'left' or 'right'
pad_value: Value to use for padding
Returns:
torch.Tensor: Padded tensor
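Examples:
Illustrative doctest with a toy 1-D tensor:
```{doctest}
>>> import torch
>>> from nemo_rl.data.llm_message_utils import _pad_tensor
>>> _pad_tensor(torch.tensor([1, 2, 3]), 5, 'right', pad_value=0)
tensor([1, 2, 3, 0, 0])
>>> _pad_tensor(torch.tensor([1, 2, 3]), 5, 'left', pad_value=0)
tensor([0, 0, 1, 2, 3])
```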
"""
pad_len = max_len - tensor.size(0)
if pad_len <= 0:
return tensor
padding = torch.full(
(pad_len, *tensor.shape[1:]),
pad_value,
dtype=tensor.dtype,
device=tensor.device,
)
return torch.cat(
[padding, tensor] if pad_side == "left" else [tensor, padding], dim=0
)
def _validate_tensor_consistency(tensors: list[Tensor]) -> None:
"""Validate that all tensors have consistent dtypes and devices.
Args:
tensors: List of tensors to validate
Raises:
RuntimeError: If tensors have different dtypes or devices
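Examples:
Illustrative doctest; the second call mixes dtypes and therefore raises:
```{doctest}
>>> import torch
>>> from nemo_rl.data.llm_message_utils import _validate_tensor_consistency
>>> _validate_tensor_consistency([torch.tensor([1, 2]), torch.tensor([3])])
>>> _validate_tensor_consistency([torch.tensor([1]), torch.tensor([1.0])])
Traceback (most recent call last):
    ...
RuntimeError: expected consistent types but got: [torch.int64, torch.float32]
```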
"""
if not tensors:
return
first = tensors[0]
if not all(t is None or t.dtype == first.dtype for t in tensors):
raise RuntimeError(
f"expected consistent types but got: {[t.dtype for t in tensors]}"
)
if not all(t is None or t.device == first.device for t in tensors):
raise RuntimeError(
f"expected tensors on the same device but got: {[t.device for t in tensors]}"
)
def batched_message_log_to_flat_message(
message_log_batch: list[LLMMessageLogType],
pad_value_dict: Optional[dict[str, int]] = None,
make_sequence_length_divisible_by: int = 1,
) -> tuple[BatchedDataDict[FlatMessagesType], Tensor]:
"""Process and pad a batch of message logs for model input.
For each message log in the batch:
1. Converts it to a flat representation using message_log_to_flat_messages
2. Pads all resulting tensors to the same length for batching
3. Returns a BatchedDataDict and sequence lengths tensor
Padding is always applied to the right side of sequences.
Args:
message_log_batch: List of LLMMessageLogType (each a conversation with multiple turns)
pad_value_dict: Dictionary mapping keys to padding values (default is 0)
make_sequence_length_divisible_by: pad the maximum sequence length up to a multiple of this value
Returns:
BatchedDataDict[FlatMessagesType]: Dictionary containing padded stacked tensors
torch.Tensor: Input lengths tensor with shape [batch_size] (pre-padding lengths)
Raises:
RuntimeError: If tensors have different dtypes or devices
Examples:
```{doctest}
>>> import torch
>>> from nemo_rl.data.llm_message_utils import batched_message_log_to_flat_message
>>> from nemo_rl.distributed.batched_data_dict import BatchedDataDict
>>> # Create a batch of two message logs with different lengths
>>> message_log_batch = [
... # First conversation
... [
... {'role': 'user', 'content': 'What is 2+2?', 'token_ids': torch.tensor([1, 2, 3, 4, 5])},
... {'role': 'assistant', 'content': '4', 'token_ids': torch.tensor([6, 7])}
... ],
... # Second conversation
... [
... {'role': 'user', 'content': 'Solve x+10=15', 'token_ids': torch.tensor([1, 8, 9, 10, 11, 12])},
... {'role': 'assistant', 'content': 'x=5', 'token_ids': torch.tensor([13, 14, 15])}
... ]
... ]
>>> pad_value_dict = {'token_ids': 0}
>>> batched_flat, input_lengths = batched_message_log_to_flat_message(message_log_batch, pad_value_dict)
>>> batched_flat['token_ids'][0].tolist()
[1, 2, 3, 4, 5, 6, 7, 0, 0]
>>> batched_flat['token_ids'][1].tolist()
[1, 8, 9, 10, 11, 12, 13, 14, 15]
>>> batched_flat['content'][0]
['What is 2+2?', '4']
>>> batched_flat['content'][1]
['Solve x+10=15', 'x=5']
>>> batched_flat['role']
[['user', 'assistant'], ['user', 'assistant']]
>>> input_lengths
tensor([7, 9], dtype=torch.int32)
>>>
```
"""
if not message_log_batch:
return BatchedDataDict(), torch.empty(0)
# Process each message log into a flat representation
sequenced_lists = [message_log_to_flat_messages(ml) for ml in message_log_batch]
all_keys = {k for seq in sequenced_lists for k in seq}
# Find max length and identify tensor keys
max_len = 0
tensor_keys = []
for seq in sequenced_lists:
for key, value in seq.items():
if isinstance(value, Tensor):
tensor_keys.append(key)
max_len = max(max_len, value.size(0))
if max_len % make_sequence_length_divisible_by != 0:
max_len = (
(max_len // make_sequence_length_divisible_by) + 1
) * make_sequence_length_divisible_by
# Handle non-tensor case
if not tensor_keys:
result: BatchedDataDict[FlatMessagesType] = BatchedDataDict(
{
k: [seq[k][0] if k in seq else None for seq in sequenced_lists]
for k in all_keys
}
)
return result, torch.empty(0)
# Create input_lengths tensor
input_lengths = []
for seq in sequenced_lists:
# Find the maximum length among all tensors in the dictionary, default to 0 if none exist
# Use maximum here since there may be keys that aren't populated for all messages yet.
# For example, logprobs don't get populated for non-generated tokens until post-processing.
seq_len = max(
(v.size(0) for v in seq.values() if isinstance(v, Tensor)), default=0
)
input_lengths.append(seq_len)
input_lengths_tensor = torch.tensor(input_lengths, dtype=torch.int32)
# Process each key
result = BatchedDataDict()
for key in all_keys:
values = [seq.get(key) for seq in sequenced_lists]
if not values or not isinstance(values[0], Tensor):
result[key] = values
continue
# Filter out None values and validate consistency
values: list[Tensor | None] = cast(list[Tensor | None], values)
tensors = cast(list[Tensor], [t for t in values if t is not None])
_validate_tensor_consistency(tensors)
# Create zero tensors for None values
filled_values: list[Tensor] = [
(
torch.zeros(0, dtype=tensors[0].dtype, device=tensors[0].device) # type: ignore
if v is None
else v
)
for v in values
]
# Pad and stack tensors (always right padding)
pad_value = pad_value_dict.get(key, 0) if pad_value_dict else 0
padded = [_pad_tensor(t, max_len, "right", pad_value) for t in filled_values]
result[key] = torch.stack(padded)
return result, input_lengths_tensor
def message_log_shape(message_log: LLMMessageLogType) -> list[dict[str, torch.Size]]:
"""Get the shape of the tensors in the message log.
This utility function examines each message in the message log and reports
the shape of tensor values or recursively processes list values.
Args:
message_log: The message log to analyze
Returns:
List of dictionaries containing tensor shapes for each key in messages
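Examples:
Illustrative doctest; string values are skipped and only tensor shapes are reported:
```{doctest}
>>> import torch
>>> from nemo_rl.data.llm_message_utils import message_log_shape
>>> message_log = [
... {'role': 'user', 'token_ids': torch.tensor([1, 2, 3])},
... {'role': 'assistant', 'token_ids': torch.tensor([4, 5])}
... ]
>>> message_log_shape(message_log)
[{'token_ids': torch.Size([3])}, {'token_ids': torch.Size([2])}]
```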
"""
shapes = []
for message in message_log:
shape = {}
for k in message.keys():
if isinstance(message[k], Tensor):
shape[k] = message[k].shape # type: ignore # we know it's a tensor
elif isinstance(message[k], list):
shape[k] = [message_log_shape(v) for v in message[k]] # type: ignore
shapes.append(shape)
return shapes
def get_first_index_that_differs(str1: str, str2: str) -> int:
"""Get the first index that differs between two strings."""
for i, (c1, c2) in enumerate(zip(str1, str2)):
if c1 != c2:
return i
return min(len(str1), len(str2))
def remap_dataset_keys(
dataset: Dataset,
mapping_dict: dict[str, str],
) -> Dataset:
"""Remap dataset keys as per mapping.
Args:
dataset: The input dataset to remap keys in
mapping_dict: A dictionary mapping input keys to output keys
Returns:
Dataset: A new dataset with remapped keys
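Examples:
Illustrative doctest with a toy two-column dataset:
```{doctest}
>>> from datasets import Dataset
>>> from nemo_rl.data.llm_message_utils import remap_dataset_keys
>>> ds = Dataset.from_dict({'question': ['What is 2+2?'], 'answer': ['4']})
>>> remapped = remap_dataset_keys(ds, {'question': 'prompt', 'answer': 'response'})
>>> sorted(remapped.column_names)
['prompt', 'response']
>>> remapped[0]['prompt']
'What is 2+2?'
```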
"""
# no need to remap if the keys are already correct
if all(k == v for k, v in mapping_dict.items()):
return dataset
# return the remapped dataset
return dataset.map(
lambda x: {v: x[k] for k, v in mapping_dict.items()},
remove_columns=list(mapping_dict.keys()),
)