stages.text.io.writer.megatron_tokenizer#

Module Contents#

Classes#

MegatronTokenizerWriter

Writer that writes a DocumentBatch to Megatron-ready tokenized files.

API#

class stages.text.io.writer.megatron_tokenizer.MegatronTokenizerWriter#

Bases: stages.text.io.writer.base.BaseWriter

Writer that writes a DocumentBatch to Megatron-ready tokenized files.

append_eod: bool#

False

cache_dir: str | None#

None

fields: list[str] | None#

‘field(…)’

file_extension: list[str]#

‘field(…)’

hf_token: str | None#

None

model_identifier: str | None#

None

name: str#

‘megatron_tokenizer_writer’

process(
task: nemo_curator.tasks.DocumentBatch,
) → nemo_curator.tasks.FileGroupTask#

Process a DocumentBatch and write to files.

Args: task (DocumentBatch): DocumentBatch containing data to write

Returns: FileGroupTask: Task containing paths to written files

setup(
_worker_metadata: nemo_curator.backends.base.WorkerMetadata | None = None,
) → None#

Setup method called once before processing begins. Override this method to perform any initialization that should happen once per worker.

Args: _worker_metadata (WorkerMetadata, optional): Information about the worker (provided by some backends)

setup_on_node(
_node_info: nemo_curator.backends.base.NodeInfo | None = None,
_worker_metadata: nemo_curator.backends.base.WorkerMetadata = None,
) → None#

Setup method called once per node in distributed settings. Override this method to perform node-level initialization.

Args: _node_info (NodeInfo, optional): Information about the node (provided by some backends)

_worker_metadata (WorkerMetadata, optional): Information about the worker (provided by some backends)

text_field: str#

‘text’

tokenization_batch_size: int#

1000

write_data(
bin_file: BinaryIO,
token_dtype: numpy.dtype,
eod_token_id: int,
tokens_batch: list[list[int]],
sequence_lengths: list[int],
) → None#

Write tokens to the .bin file.

Args: tokens_batch (list[list[int]]): The batch of tokens to write

write_idx_data(
file_prefix: str,
token_size: int,
token_dtype_code: int,
sequence_lengths: list[int],
) → None#

Write the .idx file data.