utils.script_utils

Module Contents

Classes

ArgumentHelper

A helper class to add common arguments to an argparse.ArgumentParser instance.

API

class utils.script_utils.ArgumentHelper(parser: argparse.ArgumentParser)

A helper class to add common arguments to an argparse.ArgumentParser instance.

Initialization
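
For example, a minimal sketch of typical usage (the import path mirrors the module path on this page and may differ in your installation):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser(description="Example curation script")
arg_helper = ArgumentHelper(parser)

# Attach common arguments via the add_arg_* helpers documented below.
arg_helper.add_arg_input_data_dir(required=True)
arg_helper.add_arg_output_data_dir(help="Directory to write results to.")
arg_helper.add_arg_batch_size(default=128)
arg_helper.add_arg_seed()

args = parser.parse_args()
```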

add_arg_autocast(help: str = 'Whether to use autocast or not') → None
add_arg_batch_size(
default: int = 64,
help: str = 'Number of files to read into memory at a time.',
) → None
add_arg_device() → None
add_arg_enable_spilling() → None
add_arg_id_column() → None
add_arg_id_column_type() → None
add_arg_input_data_dir(
required: bool = False,
help: str = 'Input directory consisting of .jsonl files that are accessible to all nodes. Use this for a distributed file system.',
) → None
add_arg_input_file_extension(
help: str = 'The file extension of the input files. If not provided, the input file type will be used.',
) → None
add_arg_input_file_type(
choices: list | None = None,
required: bool = False,
help: str = 'File type of the dataset to be read in. Supported file formats include "jsonl" (default), "pickle", or "parquet".',
) → None
add_arg_input_local_data_dir() → None
add_arg_input_meta() → None
add_arg_input_text_field() → None
add_arg_language(help: str) → None
add_arg_log_dir(default: str) → None
add_arg_max_chars(default: int = 2000) → None
add_arg_max_mem_gb_classifier() → None
add_arg_minhash_length() → None
add_arg_model_path(help: str = 'The path to the model file') → None
add_arg_output_data_dir(help: str) → None
add_arg_output_dir(
required: bool = False,
help: str = 'The output directory to write results.',
) → None
add_arg_output_file_type(
choices: list | None = None,
help: str = 'File type the dataset will be written to. Supported file formats include "jsonl" (default), "pickle", or "parquet".',
) → None
add_arg_output_train_file(
help: str,
default: str | None = None,
) → None
add_arg_protocol() → None
add_arg_rmm_pool_size() → None
add_arg_scheduler_address() → None
add_arg_scheduler_file() → None
add_arg_seed(
default: int = 42,
help: str = 'If specified, the random seed used for shuffling.',
) → None
add_arg_set_torch_to_use_rmm() → None
add_arg_shuffle(help: str) → None
add_arg_text_ddf_blocksize() → None
add_distributed_args() → argparse.ArgumentParser

Adds the default set of arguments needed for Dask cluster setup.
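
For example (a minimal sketch):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser(description="Distributed script")
# Returns the parser with the Dask cluster setup arguments attached.
parser = ArgumentHelper(parser).add_distributed_args()
args = parser.parse_args()
```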

add_distributed_classifier_cluster_args() → None

Adds Dask cluster arguments needed for the distributed data classifiers.

static attach_bool_arg(
parser: argparse.ArgumentParser,
flag_name: str,
default: bool = False,
help: str | None = None,
) → None
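
Because it is static, attach_bool_arg can be applied to any parser without constructing a helper. A sketch (the flag name and help string are hypothetical, and the exact command-line options it registers are not documented here):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser()
# Registers a boolean command-line option derived from flag_name.
ArgumentHelper.attach_bool_arg(
    parser,
    "enable-caching",  # hypothetical flag name, for illustration only
    default=False,
    help="Whether to cache intermediate results.",
)
```
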
attach_version_arg(version_string: str) → None
static parse_client_args(args: argparse.Namespace) → dict

Extracts relevant arguments from an argparse namespace to pass to get_client.
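
Continuing the add_distributed_args sketch above, the parsed namespace composes directly with get_client (the import path for get_client is an assumption here):

```python
# Assumed location of get_client; adjust to your package layout.
from utils.distributed_utils import get_client

# parse_client_args filters the namespace down to the keyword
# arguments that get_client accepts.
client = get_client(**ArgumentHelper.parse_client_args(args))
```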

static parse_distributed_classifier_args(
description: str = 'Default distributed classifier argument parser.',
max_chars_default: int = 2000,
) → argparse.ArgumentParser

Adds the default set of arguments common to multiple stages of the pipeline.
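
Because it is static, this method acts as a parser factory; a minimal sketch (the description string is hypothetical):

```python
from utils.script_utils import ArgumentHelper

# Build a parser pre-populated with the common classifier arguments.
parser = ArgumentHelper.parse_distributed_classifier_args(
    description="Quality classifier inference.",
)
args = parser.parse_args()
```

parse_semdedup_args below follows the same factory pattern.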

parse_gpu_dedup_args() → argparse.ArgumentParser

Adds the default set of arguments common to multiple stages of the fuzzy deduplication pipeline.

static parse_semdedup_args(
description: str = 'Default argument parser for semantic deduplication.',
) → argparse.ArgumentParser

Adds the default set of arguments common to multiple stages of the semantic deduplication pipeline.

set_default_n_workers(max_mem_gb_per_worker: float) → None

Sets the default --n-workers for a script to maximize parallelization while ensuring we don't trigger an out-of-memory error. Like --n-workers, this only applies when running the script locally.

Args: max_mem_gb_per_worker (float): The maximum memory, in gigabytes, that each worker typically reaches when running the script. It can be determined by watching the Dask dashboard. Because this value may change with the size of each shard, use a JSONL shard size of about 100 MB when measuring.
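
For example (a sketch; it assumes --n-workers was attached via add_distributed_args, and the 16 GB figure is illustrative):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser()
helper = ArgumentHelper(parser)
helper.add_distributed_args()
# Cap local parallelism so that workers which peak at ~16 GB each
# do not oversubscribe host memory.
helper.set_default_n_workers(16.0)
args = parser.parse_args()
```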