filters.heuristic_filter#

Module Contents#

Classes#

API#

class filters.heuristic_filter.BoilerPlateStringFilter(
remove_if_at_top_or_bottom: bool = True,
max_boilerplate_string_ratio: float = 0.4,
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.BulletsFilter(max_bullet_lines_ratio: float = 0.9)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.CommonEnglishWordsFilter(
min_num_common_words: int = 2,
stop_at_false: bool = True,
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: int) -> bool#
score_document(text: str) -> int#
class filters.heuristic_filter.EllipsisFilter(max_num_lines_ending_with_ellipsis_ratio: float = 0.3)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.HistogramFilter(
lang: str | None = 'en',
threshold: float | None = 0.8,
cache_dir: str | None = '',
threshold_char: str | None = ']',
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.LengthRatioFilter(
max_ratio: float = 3.0,
src_lang: str = 'en',
tgt_lang: str = 'en',
**kwargs,
)#

Bases: nemo_curator.filters.bitext_filter.BitextFilter

Initialization

keep_bitext(score: float) -> bool#
score_bitext(src: str, tgt: str) -> float#
class filters.heuristic_filter.LongWordFilter(max_word_length: int = 1000, lang: str = 'en')#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.MeanWordLengthFilter(
min_mean_word_length: int = 3,
max_mean_word_length: int = 10,
lang: str = 'en',
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.NonAlphaNumericFilter(
max_non_alpha_numeric_to_text_ratio: float = 0.25,
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.NumbersFilter(max_number_to_text_ratio: float = 0.15)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.ParenthesesFilter(max_parentheses_ratio: float = 0.1)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.PornographicUrlsFilter#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: int) -> bool#
score_document(text: str) -> int#
class filters.heuristic_filter.PunctuationFilter(
max_num_sentences_without_endmark_ratio: float = 0.85,
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.RepeatedLinesByCharFilter(max_repeated_lines_char_ratio: float = 0.8)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.RepeatedLinesFilter(max_repeated_line_fraction: float = 0.7)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.RepeatedParagraphsByCharFilter(
max_repeated_paragraphs_char_ratio: float = 0.8,
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.RepeatedParagraphsFilter(max_repeated_paragraphs_ratio: float = 0.7)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.RepeatingDuplicateNGramsFilter(
n: int = 2,
max_repeating_duplicate_ngram_ratio: float = 0.2,
lang: str = 'en',
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.RepeatingTopNGramsFilter(
n: int = 2,
max_repeating_ngram_ratio: float = 0.2,
lang: str = 'en',
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.SubstringFilter(
substring: str,
position: Literal['prefix', 'suffix', 'any'],
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: int) -> bool#
score_document(text: str) -> int#
class filters.heuristic_filter.SymbolsToWordsFilter(
max_symbol_to_word_ratio: float = 0.1,
lang: str = 'en',
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.TokenCountFilter(
tokenizer: transformers.AutoTokenizer,
min_tokens: int = 0,
max_tokens: int = float('inf'),
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: int) -> bool#
score_document(text: str) -> int#
class filters.heuristic_filter.UrlsFilter(max_url_to_text_ratio: float = 0.2)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.WhiteSpaceFilter(max_white_space_ratio: float = 0.25)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.WordCountFilter(
min_words: int = 50,
max_words: int = 100000,
lang: str = 'en',
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#
class filters.heuristic_filter.WordsWithoutAlphabetsFilter(
min_words_with_alphabets: float = 0.8,
lang: str = 'en',
)#

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool#
score_document(text: str) -> float#