# Source code for nemo_automodel.datasets.llm.mock

# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

from datasets import Dataset, Features, Sequence, Value



[docs]
def make_vocab(vocab_size:int=100):
    """
    Create a toy token->id mapping.

    The two special tokens always come first (``<pad>`` -> 0, ``<eos>`` -> 1);
    every id ``i`` in ``range(2, vocab_size)`` is then mapped from the key
    ``tok_<i>``. For ``vocab_size <= 2`` only the two special tokens are
    returned.
    """
    special_tokens = {"<pad>": 0, "<eos>": 1}
    regular_tokens = {f"tok_{idx}": idx for idx in range(2, vocab_size)}
    # Merge with the specials first so insertion order stays pad, eos, tok_2, ...
    return {**special_tokens, **regular_tokens}



[docs]
def gen_sentence_ids(vocab, mean_len:float, std_len:float, max_len:int):
    """
    Sample one random "sentence" of token ids, terminated by ``<eos>``.

    The body length is drawn from ``random.gauss(mean_len, std_len)``,
    truncated to an int, capped at ``max_len`` and then raised to at least 1.
    Body tokens are picked with replacement from every vocab value after the
    first two entries; the ``<eos>`` id is appended at the end.
    """
    # Skip the first two vocab entries (<pad>, <eos>) by position.
    candidates = [*vocab.values()][2:]

    drawn_len = int(random.gauss(mean_len, std_len))
    capped_len = min(max_len, drawn_len)
    body_len = max(1, capped_len)

    token_ids = random.choices(candidates, k=body_len)
    token_ids.append(vocab["<eos>"])
    return token_ids



[docs]
def build_unpacked_dataset(
        *,
        num_sentences: int = 10,
        mean_len: float = 20.0,
        std_len: float = 6.0,
        vocab_size: int = 100,
        max_sentence_len: int = 64,
        seed: int = 0,
):
    """
    Build a mock dataset with one variable-length sentence per example.

    The global ``random`` module is seeded with ``seed`` before sampling,
    a vocabulary of ``vocab_size`` entries is created, and ``num_sentences``
    sentences are generated from it.

    Returns:
      - a HuggingFace Dataset with fields:
          input_ids:     Sequence(int64)
          attention_mask:Sequence(int8)
          labels:        Sequence(int64)
          position_ids:  Sequence(int64)
    """
    random.seed(seed)
    vocab = make_vocab(vocab_size)
    eos_id = vocab["<eos>"]

    def _positions(token_ids):
        # Positions count up from 0 and restart right after an <eos> token.
        positions = []
        cursor = 0
        for token in token_ids:
            positions.append(cursor)
            cursor = cursor + 1 if token != eos_id else 0
        return positions

    rows = []
    for _ in range(num_sentences):
        token_ids = gen_sentence_ids(vocab, mean_len, std_len, max_sentence_len)
        position_ids = _positions(token_ids)
        row = {
            "input_ids":      token_ids,
            "attention_mask": [1] * len(token_ids),
            # labels are an independent copy of the input ids
            "labels":         list(token_ids),
            "position_ids":   position_ids,
        }
        rows.append(row)

    int64_seq = "int64"
    schema = Features({
        "input_ids":      Sequence(Value(int64_seq)),
        "attention_mask": Sequence(Value("int8")),
        "labels":         Sequence(Value(int64_seq)),
        "position_ids":   Sequence(Value(int64_seq)),
    })
    return Dataset.from_list(rows, features=schema)


if __name__ == "__main__":
    # Small smoke demo: build a five-sentence dataset and print a summary.
    demo_settings = {
        "num_sentences": 5,
        "mean_len": 12.0,
        "std_len": 3.0,
        "vocab_size": 50,
        "max_sentence_len": 20,
    }
    demo_ds = build_unpacked_dataset(**demo_settings)
    print(demo_ds)

    first_example = demo_ds[0]
    # Per-field lengths of the first example
    field_lengths = {name: len(values) for name, values in first_example.items()}
    print(field_lengths)
    print("ds[0]:", first_example)