filters.classifier_filter#
Module Contents#
Classes#
API#
- class filters.classifier_filter.FastTextLangId(
- model_path: str | None = None,
- min_langid_score: float = 0.3,
)
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(df: pandas.Series) -> pandas.Series#
- class filters.classifier_filter.FastTextQualityFilter(
- model_path: str | None = None,
- label: str = '__label__hq',
- alpha: float = 3,
- seed: int = 42,
)
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(df: pandas.Series) -> pandas.Series#
- score_document(df: pandas.Series) -> pandas.Series#
- class filters.classifier_filter.QualityEstimationFilter(
- model_name: str,
- cutoff: float,
- mode: str = 'always_en_x',
- gpu: bool = False,
- **kwargs,
)
Bases:
nemo_curator.filters.bitext_filter.BitextFilter
Initialization
- SUPPORTED_MODELS: Final[dict[str, type[nemo_curator.filters.models.qe_models.QEModel]]]#
None
- keep_bitext(score: float) -> bool#
- score_bitext(
- src: pandas.Series,
- tgt: pandas.Series,
- src_lang: pandas.Series,
- tgt_lang: pandas.Series,