filters.code#
Module Contents#
Classes#
API#
- class filters.code.AlphaFilter(min_alpha_ratio: float = 0.25)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) → bool#
- score_document(source: str) → float#
- class filters.code.GeneralCommentToCodeFilter(language: str, min_comment_to_code_ratio: float = 0.01, max_comment_to_code_ratio: float = 0.85)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) → bool#
- score_document(source: str) → float#
- class filters.code.HTMLBoilerplateFilter(min_lang_content_ratio: float = 0.2, min_lang_content_num_chars: int = 100)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) → bool#
- score_document(source: str) → float | None#
- class filters.code.NumberOfLinesOfCodeFilter(min_lines: int = 10, max_lines: int = 20000)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: int) → bool#
- score_document(source: str) → int#
- class filters.code.PerExtensionFilter(lang: str, extension: str, metadata_file: str = 'code_meta.csv')#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float | None) → bool#
- score_document(source: str) → float#
- class filters.code.PythonCommentToCodeFilter(min_comment_to_code_ratio: float = 0.01, max_comment_to_code_ratio: float = 0.85)#
Bases:
nemo_curator.filters.doc_filter.DocumentFilter
Initialization
- keep_document(score: float) → bool#
- score_document(source: str) → float#