nemo_curator.stages.text.deduplication.removal_workflow

View as Markdown

Module Contents

Classes

API

class nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow(
input_path: str | None,
ids_to_remove_path: str,
output_path: str,
input_filetype: typing.Literal['parquet', 'jsonl'] = 'parquet',
input_fields: list[str] | None = None,
id_field: str | None = CURATOR_DEDUP_ID_STR,
input_files_per_partition: int | None = None,
input_blocksize: str | None = None,
input_file_extensions: list[str] | None = None,
input_task_limit: int | None = None,
input_kwargs: dict[str, typing.Any] | None = None,
duplicate_id_field: str = 'id',
duplicate_id_read_kwargs: dict[str, typing.Any] | None = None,
id_generator_path: str | None = None,
id_generator_storage_options: dict[str, typing.Any] | None = None,
output_file_extension: str | None = None,
output_filetype: typing.Literal['parquet', 'jsonl'] = 'parquet',
output_kwargs: dict[str, typing.Any] | None = None,
output_fields: list[str] | None = None,
output_mode: typing.Literal['ignore', 'overwrite', 'append', 'error'] | None = None
)
Dataclass

Bases: WorkflowBase

duplicate_id_field
str = 'id'
duplicate_id_read_kwargs
dict[str, Any] | None = None
id_field
str | None = CURATOR_DEDUP_ID_STR
id_generator_path
str | None = None
id_generator_storage_options
dict[str, Any] | None = None
ids_to_remove_path
str
input_blocksize
str | None = None
input_fields
list[str] | None = None
input_file_extensions
list[str] | None = None
input_files_per_partition
int | None = None
input_filetype
Literal['parquet', 'jsonl'] = 'parquet'
input_kwargs
dict[str, Any] | None = None
input_path
str | None
input_task_limit
int | None = None
output_fields
list[str] | None = None
output_file_extension
str | None = None
output_filetype
Literal['parquet', 'jsonl'] = 'parquet'
output_kwargs
dict[str, Any] | None = None
output_mode
Literal['ignore', 'overwrite', 'append', 'error'] | None = None
output_path
str
nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow.__post_init__()

Initialize the parent class after dataclass initialization.

nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow._count_removed_duplicates(
tasks: list[nemo_curator.tasks.FileGroupTask] | None
) -> int
staticmethod

Sum the num_removed metadata values reported by downstream stages.

nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow._generate_stages(
initial_tasks: list[nemo_curator.tasks.FileGroupTask] | None = None
) -> list[nemo_curator.stages.base.ProcessingStage]
nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow.run(
executor: typing.Optional[nemo_curator.backends.base.BaseExecutor] = None,
initial_tasks: list[nemo_curator.tasks.FileGroupTask] | None = None
) -> nemo_curator.pipeline.workflow.WorkflowRunResult