filters.classifier_filter

Module Contents

Classes

API

class filters.classifier_filter.FastTextLangId(
model_path: str | None = None,
min_langid_score: float = 0.3,
)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool
score_document(df: pandas.Series) -> pandas.Series
class filters.classifier_filter.FastTextQualityFilter(
model_path: str | None = None,
label: str = '__label__hq',
alpha: float = 3,
seed: int = 42,
)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(df: pandas.Series) -> pandas.Series
score_document(df: pandas.Series) -> pandas.Series
class filters.classifier_filter.QualityEstimationFilter(
model_name: str,
cutoff: float,
mode: str = 'always_en_x',
gpu: bool = False,
**kwargs,
)

Bases: nemo_curator.filters.bitext_filter.BitextFilter

Initialization

SUPPORTED_MODELS: Final[dict[str, type[nemo_curator.filters.models.qe_models.QEModel]]]

None

keep_bitext(score: float) -> bool
score_bitext(
src: pandas.Series,
tgt: pandas.Series,
src_lang: pandas.Series,
tgt_lang: pandas.Series,
) -> pandas.Series