stages.text.deduplication.removal_workflow#
Module Contents#
Classes#
API#
- class stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow#
- id_generator_path: str | None#
None
- id_generator_storage_options: dict[str, Any] | None#
None
- ids_to_remove_duplicate_id_field: str#
'id'
- ids_to_remove_path: str#
None
- ids_to_remove_read_kwargs: dict[str, Any] | None#
None
- input_blocksize: str | None#
None
- input_fields: list[str] | None#
None
- input_file_extensions: list[str] | None#
None
- input_files_per_partition: int | None#
None
- input_filetype: Literal['parquet', 'jsonl']#
'parquet'
- input_id_field: str | None#
None
- input_kwargs: dict[str, Any] | None#
None
- input_path: str | None#
None
- input_task_limit: int | None#
None
- output_fields: list[str] | None#
None
- output_file_extension: str | None#
None
- output_filetype: Literal['parquet', 'jsonl']#
'parquet'
- output_kwargs: dict[str, Any] | None#
None
- output_mode: Literal['ignore', 'overwrite', 'append', 'error'] | None#
None
- output_path: str#
None
- run(
- executor: nemo_curator.backends.base.BaseExecutor | None = None,
- initial_tasks: list[nemo_curator.tasks.FileGroupTask] | None = None,