nemo_curator.stages.text.download.html_extractors.resiliparse

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| ResiliparseExtractor | - |

API

class nemo_curator.stages.text.download.html_extractors.resiliparse.ResiliparseExtractor(
required_stopword_density: float = 0.32,
main_content: bool = True,
alt_texts: bool = False
)

Bases: HTMLExtractorAlgorithm

nemo_curator.stages.text.download.html_extractors.resiliparse.ResiliparseExtractor.extract_text(
html: str,
stop_words: frozenset[str],
language: str
) -> list[str] | None