stages.text.deduplication.removal_workflow#

Module Contents#

Classes#

API#

class stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow#
id_generator_path: str | None#

None

id_generator_storage_options: dict[str, Any] | None#

None

ids_to_remove_duplicate_id_field: str#

'id'

ids_to_remove_path: str#

None

ids_to_remove_read_kwargs: dict[str, Any] | None#

None

input_blocksize: str | None#

None

input_fields: list[str] | None#

None

input_file_extensions: list[str] | None#

None

input_files_per_partition: int | None#

None

input_filetype: Literal['parquet', 'jsonl']#

'parquet'

input_id_field: str | None#

None

input_kwargs: dict[str, Any] | None#

None

input_path: str | None#

None

input_task_limit: int | None#

None

output_fields: list[str] | None#

None

output_file_extension: str | None#

None

output_filetype: Literal['parquet', 'jsonl']#

'parquet'

output_kwargs: dict[str, Any] | None#

None

output_mode: Literal['ignore', 'overwrite', 'append', 'error'] | None#

None

output_path: str#

None

run(
executor: nemo_curator.backends.base.BaseExecutor | None = None,
initial_tasks: list[nemo_curator.tasks.FileGroupTask] | None = None,
) -> list[nemo_curator.tasks.FileGroupTask] | None#