utils.script_utils
#
Module Contents#
Classes#
A helper class to add common arguments to an argparse.ArgumentParser instance.
API#
- class utils.script_utils.ArgumentHelper(parser: argparse.ArgumentParser)#
A helper class to add common arguments to an argparse.ArgumentParser instance.
Initialization
- add_arg_autocast(help: str = 'Whether to use autocast or not') None #
- add_arg_batch_size(
- default: int = 64,
- help: str = 'Number of files to read into memory at a time.',
- add_arg_device() None #
- add_arg_enable_spilling() None #
- add_arg_id_column() None #
- add_arg_id_column_type() None #
- add_arg_input_data_dir(
- required: bool = False,
- help: str = 'Input directory consisting of .jsonl files that are accessible to all nodes. Use this for a distributed file system.',
- add_arg_input_file_extension(
- help: str = 'The file extension of the input files. If not provided, the input file type will be used.',
- add_arg_input_file_type(
- choices: list | None = None,
- required: bool = False,
- help: str = 'File type of the dataset to be read in. Supported file formats include "jsonl" (default), "pickle", or "parquet".',
- add_arg_input_local_data_dir() None #
- add_arg_input_meta() None #
- add_arg_input_text_field() None #
- add_arg_language(help: str) None #
- add_arg_log_dir(default: str) None #
- add_arg_max_chars(default: int = 2000) None #
- add_arg_max_mem_gb_classifier() None #
- add_arg_minhash_length() None #
- add_arg_model_path(help: str = 'The path to the model file') None #
- add_arg_nvlink_only() None #
- add_arg_output_data_dir(help: str) None #
- add_arg_output_dir(
- required: bool = False,
- help: str = 'The output directory to write results.',
- add_arg_output_file_type(
- choices: list | None = None,
- help: str = 'File type the dataset will be written to. Supported file formats include "jsonl" (default), "pickle", or "parquet".',
- add_arg_output_train_file(
- help: str,
- default: str | None = None,
- add_arg_protocol() None #
- add_arg_rmm_pool_size() None #
- add_arg_scheduler_address() None #
- add_arg_scheduler_file() None #
- add_arg_seed(
- default: int = 42,
- help: str = 'If specified, the random seed used for shuffling.',
- add_arg_set_torch_to_use_rmm() None #
- add_arg_shuffle(help: str) None #
- add_arg_text_ddf_blocksize() None #
- add_distributed_args() argparse.ArgumentParser #
Adds default set of arguments that are needed for Dask cluster setup.
- add_distributed_classifier_cluster_args() None #
Adds Dask cluster arguments needed for the distributed data classifiers.
- static attach_bool_arg(
- parser: argparse.ArgumentParser,
- flag_name: str,
- default: bool = False,
- help: str | None = None,
- attach_version_arg(version_string: str) None #
- static parse_client_args(args: argparse.Namespace) dict #
Extracts relevant arguments from an argparse namespace to pass to get_client.
- static parse_distributed_classifier_args(
- description: str = 'Default distributed classifier argument parser.',
- max_chars_default: int = 2000,
Adds default set of arguments that are common to multiple stages of the pipeline.
- parse_gpu_dedup_args() argparse.ArgumentParser #
Adds default set of arguments that are common to multiple stages of the fuzzy deduplication pipeline.
- static parse_semdedup_args(
- description: str = 'Default argument parser for semantic deduplication.',
Adds default set of arguments that are common to multiple stages of the semantic deduplication pipeline.
- set_default_n_workers(max_mem_gb_per_worker: float) None #
Sets the default --n-workers for a script to maximize parallelization while ensuring we don't trigger an out of memory error. Like --n-workers, this only applies when running the script locally.
Args: max_mem_gb_per_worker (float): The maximum memory that each worker usually achieves for a script in units of gigabytes. It can be determined by watching the Dask dashboard. This value may change based on the size of each shard, so use a JSONL shard size of about 100 MB.