filters.doc_filter#

Module Contents#

Classes#

Functions#

API#

class filters.doc_filter.DocumentFilter#

Bases: abc.ABC

Initialization

property backend: Literal["pandas", "cudf", "any"]#
abstractmethod keep_document(scores: float | list[int | float]) -> bool#
property name: str#
property ngrams: dict#
property paragraphs: list#
abstractmethod score_document(text: str) -> float | list[int | float]#
property sentences: list#
filters.doc_filter.import_filter(
filter_path: str,
) -> filters.doc_filter.DocumentFilter | nemo_curator.filters.bitext_filter.BitextFilter#