filters.heuristic_filter#
Module Contents#
Classes#
API#
- class filters.heuristic_filter.BoilerPlateStringFilter(
- remove_if_at_top_or_bottom: bool = True,
- max_boilerplate_string_ratio: float = 0.4,
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.BulletsFilter(max_bullet_lines_ratio: float = 0.9)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.CommonEnglishWordsFilter(
- min_num_common_words: int = 2,
- stop_at_false: bool = True,
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: int) -> bool#
- score_document(text: str) -> int#
- class filters.heuristic_filter.EllipsisFilter(max_num_lines_ending_with_ellipsis_ratio: float = 0.3)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.HistogramFilter(
- lang: str | None = 'en',
- threshold: float | None = 0.8,
- cache_dir: str | None = '',
- threshold_char: str | None = ']',
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.LengthRatioFilter(
- max_ratio: float = 3.0,
- src_lang: str = 'en',
- tgt_lang: str = 'en',
- **kwargs,
- )#
Bases:
nemo_curator.filters.bitext_filter.BitextFilter
Initialization
- keep_bitext(score: float) -> bool#
- score_bitext(src: str, tgt: str) -> float#
- class filters.heuristic_filter.LongWordFilter(max_word_length: int = 1000, lang: str = 'en')#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.MeanWordLengthFilter(
- min_mean_word_length: int = 3,
- max_mean_word_length: int = 10,
- lang: str = 'en',
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.NonAlphaNumericFilter(
- max_non_alpha_numeric_to_text_ratio: float = 0.25,
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.NumbersFilter(max_number_to_text_ratio: float = 0.15)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.ParenthesesFilter(max_parentheses_ratio: float = 0.1)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.PornographicUrlsFilter#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: int) -> bool#
- score_document(text: str) -> int#
- class filters.heuristic_filter.PunctuationFilter(
- max_num_sentences_without_endmark_ratio: float = 0.85,
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.RepeatedLinesByCharFilter(max_repeated_lines_char_ratio: float = 0.8)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.RepeatedLinesFilter(max_repeated_line_fraction: float = 0.7)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.RepeatedParagraphsByCharFilter(
- max_repeated_paragraphs_char_ratio: float = 0.8,
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.RepeatedParagraphsFilter(max_repeated_paragraphs_ratio: float = 0.7)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.RepeatingDuplicateNGramsFilter(
- n: int = 2,
- max_repeating_duplicate_ngram_ratio: float = 0.2,
- lang: str = 'en',
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.RepeatingTopNGramsFilter(
- n: int = 2,
- max_repeating_ngram_ratio: float = 0.2,
- lang: str = 'en',
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.SubstringFilter(
- substring: str,
- position: Literal['prefix', 'suffix', 'any'],
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: int) -> bool#
- score_document(text: str) -> int#
- class filters.heuristic_filter.SymbolsToWordsFilter(
- max_symbol_to_word_ratio: float = 0.1,
- lang: str = 'en',
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.TokenCountFilter(
- tokenizer: transformers.AutoTokenizer,
- min_tokens: int = 0,
- max_tokens: int = float('inf'),
- )#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: int) -> bool#
- score_document(text: str) -> int#
- class filters.heuristic_filter.UrlsFilter(max_url_to_text_ratio: float = 0.2)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#
- class filters.heuristic_filter.WhiteSpaceFilter(max_white_space_ratio: float = 0.25)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) -> bool#
- score_document(text: str) -> float#