nemo_curator.stages.text.filters.fasttext.fasttext_filters

View as Markdown

Module Contents

Classes

API

class nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId(
model_path: str | None = None,
min_langid_score: float = 0.3
)

Bases: DocumentFilter

_name = 'lang_id'
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.keep_document(
score: float | str
) -> bool
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.load_model() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.model_check_or_download() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextLangId.score_document(
text: str
) -> list[float | str]
class nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter(
model_path: str | None = None,
label: str = '__label__hq',
alpha: float = 3,
seed: int = 42
)

Bases: DocumentFilter

_name = 'fasttext_quality_filter'
_seed = np.random.seed(seed)
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.keep_document(
score: float
) -> bool
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.load_model() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.model_check_or_download() -> None
nemo_curator.stages.text.filters.fasttext.fasttext_filters.FastTextQualityFilter.score_document(
text: str
) -> float