Source code for nemo_rl.data.hf_datasets.openmathinstruct2

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Optional

from datasets import load_dataset

from nemo_rl.data.interfaces import TaskDataSpec



[docs]
def format_math(data, output_key: str = "expected_answer"):
    """Convert one dataset record into a two-turn chat example.

    Args:
        data: Mapping that provides a ``"problem"`` entry and an entry
            under ``output_key``.
        output_key: Key of ``data`` whose value becomes the assistant reply.

    Returns:
        A dict with a ``"messages"`` list (user turn carrying the problem,
        then assistant turn carrying ``data[output_key]``) and a
        ``"task_name"`` fixed to ``"math"``.
    """
    user_turn = {"role": "user", "content": data["problem"]}
    assistant_turn = {"role": "assistant", "content": data[output_key]}

    formatted = {"messages": [user_turn, assistant_turn]}
    # As of the v0.1 release, every nemo rl dataset example must carry a
    # task_name so that a task processor can be mapped to each unique task.
    formatted["task_name"] = "math"
    return formatted




[docs]
def prepare_openinstructmath2_dataset(
    split: str = "train_1M",
    seed=42,
    test_size=0.05,
    output_key: str = "expected_answer",
):
    """Build chat-formatted train/validation sets from OpenMathInstruct-2.

    The requested split of ``nvidia/OpenMathInstruct-2`` is loaded with
    ``load_dataset``, divided with ``train_test_split``, and every example
    is rewritten by ``format_math``; the original columns are dropped.

    Args:
        split: Split name passed to ``load_dataset``.
        seed: Seed forwarded to ``train_test_split``.
        test_size: Validation size forwarded to ``train_test_split``.
        output_key: Column used as the assistant message by ``format_math``.

    Returns:
        A dict with keys ``"train"`` and ``"validation"`` holding the
        formatted datasets.
    """
    print(
        "WARNING: For reproducible experiments, preprocess the dataset once and define your own HfDataset subclass that directly uses the preprocessed datasets."
    )

    raw_ds = load_dataset("nvidia/OpenMathInstruct-2", split=split)
    partitions = raw_ds.train_test_split(test_size=test_size, seed=seed)

    # train_test_split names its held-out part "test"; it is exposed to
    # callers under the key "validation".
    formatted = {}
    for out_name, hf_name in (("train", "train"), ("validation", "test")):
        subset = partitions[hf_name]
        formatted[out_name] = subset.map(
            format_math,
            remove_columns=subset.column_names,
            fn_kwargs={"output_key": output_key},
        )
    return formatted




[docs]
class OpenMathInstruct2Dataset:
    """OpenMathInstruct-2 formatted as chat examples with a train/validation split.

    Attributes set by ``__init__``:
        formatted_ds: Result of ``prepare_openinstructmath2_dataset`` (a dict
            with ``"train"`` and ``"validation"`` entries).
        task_spec: ``TaskDataSpec`` with task name ``"OpenMathInstruct-2"``
            and the given ``prompt_file``.
    """

    def __init__(
        self,
        split: str = "train_1M",
        seed: int = 42,
        test_size: float = 0.05,
        output_key: str = "expected_answer",
        prompt_file: Optional[str] = None,
    ):
        """Initialize the OpenMathInstruct2 dataset with train/validation split.

        Args:
            split: Dataset split to load; one of 'train', 'train_1M',
                'train_2M', or 'train_5M'.
            seed: Random seed for reproducible splitting
            test_size: Proportion of data to use for validation (0.0-1.0)
            output_key: Column whose value is used as the assistant response.
            prompt_file: Optional prompt file forwarded to ``TaskDataSpec``;
                ``None`` passes no prompt file.

        Raises:
            ValueError: If ``split`` is not one of the supported splits.
        """
        # Validate up front so an unsupported name fails before any dataset
        # loading is attempted.
        supported_splits = ("train", "train_1M", "train_2M", "train_5M")
        if split not in supported_splits:
            raise ValueError(
                f"Invalid split: {split}. Please use 'train', 'train_1M', 'train_2M', or 'train_5M'."
            )

        self.formatted_ds = prepare_openinstructmath2_dataset(
            split=split, seed=seed, test_size=test_size, output_key=output_key
        )

        self.task_spec = TaskDataSpec(
            task_name="OpenMathInstruct-2",
            prompt_file=prompt_file,
        )