> For clean Markdown of any page, append .md to the page URL.
> For a complete documentation index, see https://docs.nvidia.com/nemo/curator/llms.txt.
> For full documentation content, see https://docs.nvidia.com/nemo/curator/llms-full.txt.

# nemo_curator.stages.text.deduplication.removal_workflow

## Module Contents

### Classes

| Name                                                                                                                      | Description |
| ------------------------------------------------------------------------------------------------------------------------- | ----------- |
| [`TextDuplicatesRemovalWorkflow`](#nemo_curator-stages-text-deduplication-removal_workflow-TextDuplicatesRemovalWorkflow) | Dataclass workflow that extends `WorkflowBase`. |

### API

<Anchor id="nemo_curator-stages-text-deduplication-removal_workflow-TextDuplicatesRemovalWorkflow">
  <CodeBlock showLineNumbers={false} wordWrap={true}>
    ```python
    class nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow(
        input_path: str | None,
        ids_to_remove_path: str,
        output_path: str,
        input_filetype: typing.Literal['parquet', 'jsonl'] = 'parquet',
        input_fields: list[str] | None = None,
        id_field: str | None = CURATOR_DEDUP_ID_STR,
        input_files_per_partition: int | None = None,
        input_blocksize: str | None = None,
        input_file_extensions: list[str] | None = None,
        input_task_limit: int | None = None,
        input_kwargs: dict[str, typing.Any] | None = None,
        duplicate_id_field: str = 'id',
        duplicate_id_read_kwargs: dict[str, typing.Any] | None = None,
        id_generator_path: str | None = None,
        id_generator_storage_options: dict[str, typing.Any] | None = None,
        output_file_extension: str | None = None,
        output_filetype: typing.Literal['parquet', 'jsonl'] = 'parquet',
        output_kwargs: dict[str, typing.Any] | None = None,
        output_fields: list[str] | None = None,
        output_mode: typing.Literal['ignore', 'overwrite', 'append', 'error'] | None = None
    )
    ```
  </CodeBlock>
</Anchor>

<Indent>
  <Badge>
    Dataclass
  </Badge>

  **Bases:** [WorkflowBase](/nemo-curator/nemo_curator/pipeline/workflow#nemo_curator-pipeline-workflow-WorkflowBase)

  <ParamField path="duplicate_id_field" type="str = 'id'" />

  <ParamField path="duplicate_id_read_kwargs" type="dict[str, Any] | None = None" />

  <ParamField path="id_field" type="str | None = CURATOR_DEDUP_ID_STR" />

  <ParamField path="id_generator_path" type="str | None = None" />

  <ParamField path="id_generator_storage_options" type="dict[str, Any] | None = None" />

  <ParamField path="ids_to_remove_path" type="str" />

  <ParamField path="input_blocksize" type="str | None = None" />

  <ParamField path="input_fields" type="list[str] | None = None" />

  <ParamField path="input_file_extensions" type="list[str] | None = None" />

  <ParamField path="input_files_per_partition" type="int | None = None" />

  <ParamField path="input_filetype" type="Literal['parquet', 'jsonl'] = 'parquet'" />

  <ParamField path="input_kwargs" type="dict[str, Any] | None = None" />

  <ParamField path="input_path" type="str | None" />

  <ParamField path="input_task_limit" type="int | None = None" />

  <ParamField path="output_fields" type="list[str] | None = None" />

  <ParamField path="output_file_extension" type="str | None = None" />

  <ParamField path="output_filetype" type="Literal['parquet', 'jsonl'] = 'parquet'" />

  <ParamField path="output_kwargs" type="dict[str, Any] | None = None" />

  <ParamField path="output_mode" type="Literal['ignore', 'overwrite', 'append', 'error'] | None = None" />

  <ParamField path="output_path" type="str" />

  <Anchor id="nemo_curator-stages-text-deduplication-removal_workflow-TextDuplicatesRemovalWorkflow-__post_init__">
    <CodeBlock showLineNumbers={false} wordWrap={true}>
      ```python
      nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow.__post_init__()
      ```
    </CodeBlock>
  </Anchor>

  <Indent>
    Initialize the parent class after dataclass initialization.
  </Indent>

  <Anchor id="nemo_curator-stages-text-deduplication-removal_workflow-TextDuplicatesRemovalWorkflow-_count_removed_duplicates">
    <CodeBlock links={{"nemo_curator.tasks.FileGroupTask":"/nemo-curator/nemo_curator/tasks/file_group#nemo_curator-tasks-file_group-FileGroupTask"}} showLineNumbers={false} wordWrap={true}>
      ```python
      nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow._count_removed_duplicates(
          tasks: list[nemo_curator.tasks.FileGroupTask] | None
      ) -> int
      ```
    </CodeBlock>
  </Anchor>

  <Indent>
    <Badge>
      staticmethod
    </Badge>

    Sum the `num_removed` metadata values reported by downstream stages.
  </Indent>

  <Anchor id="nemo_curator-stages-text-deduplication-removal_workflow-TextDuplicatesRemovalWorkflow-_generate_stages">
    <CodeBlock links={{"nemo_curator.tasks.FileGroupTask":"/nemo-curator/nemo_curator/tasks/file_group#nemo_curator-tasks-file_group-FileGroupTask","nemo_curator.stages.base.ProcessingStage":"/nemo-curator/nemo_curator/stages/base#nemo_curator-stages-base-ProcessingStage"}} showLineNumbers={false} wordWrap={true}>
      ```python
      nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow._generate_stages(
          initial_tasks: list[nemo_curator.tasks.FileGroupTask] | None = None
      ) -> list[nemo_curator.stages.base.ProcessingStage]
      ```
    </CodeBlock>
  </Anchor>

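  <Indent>
    Return the list of `ProcessingStage` objects for this workflow, optionally taking `initial_tasks` (a list of `FileGroupTask`) as input.
  </Indent>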

  <Anchor id="nemo_curator-stages-text-deduplication-removal_workflow-TextDuplicatesRemovalWorkflow-run">
    <CodeBlock links={{"nemo_curator.backends.base.BaseExecutor":"/nemo-curator/nemo_curator/backends/base#nemo_curator-backends-base-BaseExecutor","nemo_curator.tasks.FileGroupTask":"/nemo-curator/nemo_curator/tasks/file_group#nemo_curator-tasks-file_group-FileGroupTask","nemo_curator.pipeline.workflow.WorkflowRunResult":"/nemo-curator/nemo_curator/pipeline/workflow#nemo_curator-pipeline-workflow-WorkflowRunResult"}} showLineNumbers={false} wordWrap={true}>
      ```python
      nemo_curator.stages.text.deduplication.removal_workflow.TextDuplicatesRemovalWorkflow.run(
          executor: typing.Optional[nemo_curator.backends.base.BaseExecutor] = None,
          initial_tasks: list[nemo_curator.tasks.FileGroupTask] | None = None
      ) -> nemo_curator.pipeline.workflow.WorkflowRunResult
      ```
    </CodeBlock>
  </Anchor>

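  <Indent>
    Run the workflow with an optional `executor` and optional `initial_tasks`, returning a `WorkflowRunResult`.
  </Indent>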
</Indent>