nemo_curator.stages.text.filters.fasttext.fasttext_filters
nemo_curator.stages.text.filters.fasttext.fasttext_filters
Module Contents
Classes
| Name | Description |
|---|---|
| FastTextLangId | - |
| FastTextQualityFilter | - |
API
class nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId( model_path: str | None = None, min_langid_score: float = 0.3 )
Bases: DocumentFilter
_name = 'lang_id'
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.keep_document( score: float | str ) -> bool
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.load_model() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.model_check_or_download() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.score_document( text: str ) -> list[float | str]
class nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter( model_path: str | None = None, label: str = '__label__hq', alpha: float = 3, seed: int = 42 )
Bases: DocumentFilter
_name = 'fasttext_quality_filter'
_seed = np.random.seed(seed)
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.keep_document( score: float ) -> bool
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.load_model() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.model_check_or_download() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.score_document( text: str ) -> float