nemo_curator.stages.text.download.html_extractors.trafilatura

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| TrafilaturaExtractor | - |

API

class nemo_curator.stages.text.download.html_extractors.trafilatura.TrafilaturaExtractor(
required_stopword_density: float = 0.32,
min_extracted_size: int = 250,
min_extracted_comm_size: int = 1,
min_output_size: int = 1,
min_output_comm_size: int = 1,
max_tree_size: int | None = None,
min_duplcheck_size: int = 100,
max_repetitions: int = 2,
extract_kwargs = {}
)

Bases: HTMLExtractorAlgorithm

nemo_curator.stages.text.download.html_extractors.trafilatura.TrafilaturaExtractor.extract_text(
html: str,
stop_words: frozenset[str],
language: str
) -> list[str] | None