nemo_curator.stages.text.download.common_crawl.extract

View as Markdown

Module Contents

Classes

Name | Description
CommonCrawlHTMLExtractor | -

API

class nemo_curator.stages.text.download.common_crawl.extract.CommonCrawlHTMLExtractor(
algorithm: nemo_curator.stages.text.download.html_extractors.HTMLExtractorAlgorithm | str | None = None,
algorithm_kwargs: dict | None = None,
stop_lists: dict[str, frozenset[str]] | None = None
)

Bases: DocumentExtractor

nemo_curator.stages.text.download.common_crawl.extract.CommonCrawlHTMLExtractor.extract(
record: dict[str, typing.Any]
) -> dict[str, typing.Any] | None

Extract text from HTML content in the record.

Takes a record dict containing a “content” field with HTML and returns a new dict with only the output columns: url, warc_id, source_id, language, text.

nemo_curator.stages.text.download.common_crawl.extract.CommonCrawlHTMLExtractor.input_columns() -> list[str]
nemo_curator.stages.text.download.common_crawl.extract.CommonCrawlHTMLExtractor.output_columns() -> list[str]