Source code for nemo_automodel.datasets.llm.mock_packed

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

from datasets import Dataset, Features, Sequence, Value



[docs]
def make_vocab(vocab_size:int=100):
    """
    Build a trivial vocab; index 0=<pad>, 1=<eos>, rest = word_i.
    """
    vocab = {"<pad>": 0, "<eos>": 1}
    for i in range(2, vocab_size):
        vocab[f"tok_{i}"] = i
    return vocab



[docs]
def gen_sentence_ids(vocab, mean_len:float, std_len:float, max_len:int):
    """
    Sentence generator with Gaussian length control.
    """
    words = list(vocab.values())[2:]     # exclude <pad>, <eos>
    # truncated Gaussian
    L = max(1, min(max_len, int(random.gauss(mean_len, std_len))))
    return random.choices(words, k=L) + [vocab["<eos>"]]



[docs]
def flush_block(block, block_size):
    """
    Flush helper (build position_ids that reset after <eos>).
    """
    pos, pos_ids = 0, []
    for tid in block:
        pos_ids.append(pos)
        pos = 0 if tid == 1 else pos + 1          # 1 == <eos>
    return {
        "input_ids": block,
        "attention_mask": [1]*block_size,
        "labels": block.copy(),
        "position_ids": pos_ids,
    }



[docs]
def build_packed_dataset(
        *,
        num_blocks:int          = 10,
        block_size:int          = 128,
        mean_len:float          = 20.0,
        std_len:float           = 6.0,
        vocab_size:int          = 100,
        max_sentence_len:int    = 64,
        seed:int                = 0,
):
    """
    Dataset builder.
    """
    random.seed(seed)
    vocab = make_vocab(vocab_size)
    current, examples = [], []

    while len(examples) < num_blocks:
        sent = gen_sentence_ids(vocab, mean_len, std_len, max_sentence_len)

        # overflow? -> save current block if full
        if len(current) + len(sent) > block_size:
            if len(current) == block_size:
                examples.append(flush_block(current, block_size))
            current = []

        current.extend(sent)

    # Optional: emit last block if exactly full
    if len(current) == block_size and len(examples) < num_blocks:
        examples.append(flush_block(current, block_size))

    features = Features({
        "input_ids":     Sequence(Value("int64")),
        "attention_mask":Sequence(Value("int8")),
        "labels":        Sequence(Value("int64")),
        "position_ids":  Sequence(Value("int64")),
    })
    return Dataset.from_list(examples[:num_blocks], features=features)


if __name__ == "__main__":
    ds = build_packed_dataset(
        num_blocks=3,
        block_size=32,
        mean_len=10,
        std_len=3,
        vocab_size=50,
    )
    print(ds)
    print("Row-0 lengths:", {k: len(v) for k,v in ds[0].items()})
    print("Row-0 position_ids:", ds[0]["position_ids"])