filters.code

Module Contents

Classes

API

class filters.code.AlphaFilter(min_alpha_ratio: float = 0.25)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool
score_document(source: str) -> float
class filters.code.GeneralCommentToCodeFilter(
language: str,
min_comment_to_code_ratio: float = 0.01,
max_comment_to_code_ratio: float = 0.85,
)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool
score_document(source: str) -> float
class filters.code.HTMLBoilerplateFilter(
min_lang_content_ratio: float = 0.2,
min_lang_content_num_chars: int = 100,
)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool
score_document(source: str) -> float | None
class filters.code.NumberOfLinesOfCodeFilter(min_lines: int = 10, max_lines: int = 20000)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: int) -> bool
score_document(source: str) -> int
class filters.code.PerExtensionFilter(
lang: str,
extension: str,
metadata_file: str = 'code_meta.csv',
)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float | None) -> bool
score_document(source: str) -> float
class filters.code.PythonCommentToCodeFilter(
min_comment_to_code_ratio: float = 0.01,
max_comment_to_code_ratio: float = 0.85,
)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool
score_document(source: str) -> float
class filters.code.TokenizerFertilityFilter(
path_to_tokenizer: str | None = None,
min_char_to_token_ratio: float = 2.5,
)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool
score_document(source: str) -> float
class filters.code.XMLHeaderFilter(char_prefix_search_length: int = 100)

Bases: nemo_curator.filters.doc_filter.DocumentFilter

Initialization

keep_document(score: float) -> bool
score_document(source: str) -> float