utils.script_utils

Module Contents

Classes

ArgumentHelper

A helper class to add common arguments to an argparse.ArgumentParser instance.

API

class utils.script_utils.ArgumentHelper(parser: argparse.ArgumentParser)

A helper class to add common arguments to an argparse.ArgumentParser instance.

Initialization
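
For example, a minimal sketch of typical usage (the import path mirrors the module path on this page and may differ in your installation):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser(description="Example curation script")
arg_helper = ArgumentHelper(parser)

# Attach common arguments via the add_arg_* helpers documented below.
arg_helper.add_arg_input_data_dir(required=True)
arg_helper.add_arg_output_data_dir(help="Directory to write results to.")
arg_helper.add_arg_batch_size(default=128)
arg_helper.add_arg_seed()

args = parser.parse_args()
```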

add_arg_autocast(help: str = 'Whether to use autocast or not') → None
add_arg_batch_size(
default: int = 64,
help: str = 'Number of files to read into memory at a time.',
) → None
add_arg_device() → None
add_arg_enable_spilling() → None
add_arg_id_column() → None
add_arg_id_column_type() → None
add_arg_input_data_dir(
required: bool = False,
help: str = 'Input directory consisting of .jsonl files that are accessible to all nodes. Use this for a distributed file system.',
) → None
add_arg_input_file_extension(
help: str = 'The file extension of the input files. If not provided, the input file type will be used.',
) → None
add_arg_input_file_type(
choices: list | None = None,
required: bool = False,
help: str = 'File type of the dataset to be read in. Supported file formats include "jsonl" (default), "pickle", or "parquet".',
) → None
add_arg_input_local_data_dir() → None
add_arg_input_meta() → None
add_arg_input_text_field() → None
add_arg_language(help: str) → None
add_arg_log_dir(default: str) → None
add_arg_max_chars(default: int = 2000) → None
add_arg_max_mem_gb_classifier() → None
add_arg_minhash_length() → None
add_arg_model_path(help: str = 'The path to the model file') → None
add_arg_output_data_dir(help: str) → None
add_arg_output_dir(
required: bool = False,
help: str = 'The output directory to write results.',
) → None
add_arg_output_file_type(
choices: list | None = None,
help: str = 'File type the dataset will be written to. Supported file formats include "jsonl" (default), "pickle", or "parquet".',
) → None
add_arg_output_train_file(
help: str,
default: str | None = None,
) → None
add_arg_protocol() → None
add_arg_rmm_pool_size() → None
add_arg_scheduler_address() → None
add_arg_scheduler_file() → None
add_arg_seed(
default: int = 42,
help: str = 'If specified, the random seed used for shuffling.',
) → None
add_arg_set_torch_to_use_rmm() → None
add_arg_shuffle(help: str) → None
add_arg_text_ddf_blocksize() → None
add_distributed_args() → argparse.ArgumentParser

Adds the default set of arguments needed for Dask cluster setup.
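
For example (a minimal sketch):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser(description="Distributed script")
# Returns the parser with the Dask cluster setup arguments attached.
parser = ArgumentHelper(parser).add_distributed_args()
args = parser.parse_args()
```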

add_distributed_classifier_cluster_args() → None

Adds Dask cluster arguments needed for the distributed data classifiers.

static attach_bool_arg(
parser: argparse.ArgumentParser,
flag_name: str,
default: bool = False,
help: str | None = None,
) → None
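
Because it is static, attach_bool_arg can be applied to any parser without constructing a helper. A sketch (the flag name and help string are hypothetical, and the exact command-line options it registers are not documented here):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser()
# Registers a boolean command-line option derived from flag_name.
ArgumentHelper.attach_bool_arg(
    parser,
    "enable-caching",  # hypothetical flag name, for illustration only
    default=False,
    help="Whether to cache intermediate results.",
)
```
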
attach_version_arg(version_string: str) → None
static parse_client_args(args: argparse.Namespace) → dict

Extracts relevant arguments from an argparse namespace to pass to get_client.
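
Continuing the add_distributed_args sketch above, the parsed namespace composes directly with get_client (the import path for get_client is an assumption here):

```python
# Assumed location of get_client; adjust to your package layout.
from utils.distributed_utils import get_client

# parse_client_args filters the namespace down to the keyword
# arguments that get_client accepts.
client = get_client(**ArgumentHelper.parse_client_args(args))
```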

static parse_distributed_classifier_args(
description: str = 'Default distributed classifier argument parser.',
max_chars_default: int = 2000,
) → argparse.ArgumentParser

Adds the default set of arguments common to multiple stages of the pipeline.
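
Because it is static, this method acts as a parser factory; a minimal sketch (the description string is hypothetical):

```python
from utils.script_utils import ArgumentHelper

# Build a parser pre-populated with the common classifier arguments.
parser = ArgumentHelper.parse_distributed_classifier_args(
    description="Quality classifier inference.",
)
args = parser.parse_args()
```

parse_semdedup_args below follows the same factory pattern.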

parse_gpu_dedup_args() → argparse.ArgumentParser

Adds the default set of arguments common to multiple stages of the fuzzy deduplication pipeline.

static parse_semdedup_args(
description: str = 'Default argument parser for semantic deduplication.',
) → argparse.ArgumentParser

Adds the default set of arguments common to multiple stages of the semantic deduplication pipeline.

set_default_n_workers(max_mem_gb_per_worker: float) → None

Sets the default --n-workers for a script to maximize parallelization while ensuring we don't trigger an out-of-memory error. Like --n-workers, this only applies when running the script locally.

Args: max_mem_gb_per_worker (float): The maximum memory, in gigabytes, that each worker typically reaches when running the script. It can be determined by watching the Dask dashboard. Because this value may change with the size of each shard, use a JSONL shard size of about 100 MB when measuring.
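
For example (a sketch; it assumes --n-workers was attached via add_distributed_args, and the 16 GB figure is illustrative):

```python
import argparse

from utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser()
helper = ArgumentHelper(parser)
helper.add_distributed_args()
# Cap local parallelism so that workers which peak at ~16 GB each
# do not oversubscribe host memory.
helper.set_default_n_workers(16.0)
args = parser.parse_args()
```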