nemo_curator.stages.text.io.writer.megatron_tokenizer

View as Markdown

Module Contents

Classes

Name | Description
MegatronTokenizerWriter — Writer that writes a DocumentBatch to Megatron-ready tokenized files.

Data

_INDEX_HEADER

API

class nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter(
path: str,
file_extension: list[str] = (lambda: FILETYPE_TO_DEFAUL...,
write_kwargs: dict[str, typing.Any] = dict(),
fields: list[str] | None = None,
name: str = 'megatron_tokenizer_writer',
mode: typing.Literal['ignore', 'overwrite', 'append', 'error'] = 'ignore',
append_mode_implemented: bool = False,
model_identifier: str | None = None,
cache_dir: str | None = None,
hf_token: str | None = None,
text_field: str = 'text',
tokenization_batch_size: int = 1000,
append_eod: bool = False
)
Dataclass

Bases: BaseWriter

Writer that writes a DocumentBatch to Megatron-ready tokenized files.

append_eod
bool = False
cache_dir
str | None = None
fields
list[str] | None = field(default=None, init=False, repr=False)
file_extension
list[str]
hf_token
str | None = None
model_identifier
str | None = None
name
str = 'megatron_tokenizer_writer'
text_field
str = 'text'
tokenization_batch_size
int = 1000
nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter.__post_init__()
nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter._sequence_pointers(
sequence_lengths: list[int],
token_size: int
) -> list[int]
staticmethod

Build the sequence pointers from the sequence lengths and the token dtype size.

Returns: list[int]: The pointer to the beginning of each sequence

Parameters:

sequence_lengths
list[int]

The length of each sequence

token_size
int

The size of each token in bytes

nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter.process(
task: nemo_curator.tasks.DocumentBatch
) -> nemo_curator.tasks.FileGroupTask
nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter.setup(
_worker_metadata: nemo_curator.backends.base.WorkerMetadata | None = None
) -> None
nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter.setup_on_node(
_node_info: nemo_curator.backends.base.NodeInfo | None = None,
_worker_metadata: nemo_curator.backends.base.WorkerMetadata = None
) -> None
nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter.write_data(
bin_file: typing.BinaryIO,
token_dtype: numpy.dtype,
eod_token_id: int,
tokens_batch: list[list[int]],
sequence_lengths: list[int]
) -> None

Write tokens to the .bin file.

Parameters:

tokens_batch
list[list[int]]

The batch of tokens to write

nemo_curator.stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter.write_idx_data(
file_prefix: str,
token_size: int,
token_dtype_code: int,
sequence_lengths: list[int]
) -> None

Write the .idx file data

nemo_curator.stages.text.io.writer.megatron_tokenizer._INDEX_HEADER = b'MMIDIDX\x00\x00'