nemo_curator.stages.text.download.html_extractors.justext

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| JusTextExtractor | - |

API

class nemo_curator.stages.text.download.html_extractors.justext.JusTextExtractor(
length_low: int = 70,
length_high: int = 200,
stopwords_low: float = 0.3,
stopwords_high: float = 0.32,
max_link_density: float = 0.2,
max_heading_distance: int = 200,
no_headings: bool = False,
is_boilerplate: bool | None = None
)

Bases: HTMLExtractorAlgorithm

_logged_languages: set[str] = set()
nemo_curator.stages.text.download.html_extractors.justext.JusTextExtractor.extract_text(
html: str,
stop_words: frozenset[str],
language: str
) -> list[str] | None