download.commoncrawl#

Module Contents#

Classes#

CommonCrawlWARCDownloader
CommonCrawlWARCDownloaderExtractOnly
CommonCrawlWARCExtractor
CommonCrawlWARCIterator
HTMLExtractorAlgorithm
JusTextExtractor
ResiliparseExtractor
TrafilaturaExtractor

Functions#

decode_html
download_common_crawl
get_all_stop_words
get_stop_list_dict
lang_detect
try_decode_with_detected_encoding

Data#

NON_SPACED_LANGUAGES

API#

class download.commoncrawl.CommonCrawlWARCDownloader(
download_dir: str,
aws: bool = False,
verbose: bool = False,
)#

Bases: nemo_curator.download.doc_builder.DocumentDownloader

Initialization

download(url: str) → str#
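
A minimal usage sketch, assuming the module is importable under `nemo_curator.download.commoncrawl` (as the base-class paths on this page suggest); the WARC URL is a hypothetical placeholder:

```python
from nemo_curator.download.commoncrawl import CommonCrawlWARCDownloader

downloader = CommonCrawlWARCDownloader(
    download_dir="/tmp/cc_warcs",  # downloaded WARC files land here
    aws=False,    # True presumably switches to S3-based fetching
    verbose=True, # log download progress
)

# Hypothetical WARC URL; real paths come from a Common Crawl snapshot index.
warc_url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-06/segments/example/warc/example.warc.gz"
local_path = downloader.download(warc_url)  # returns the local file path
```
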
class download.commoncrawl.CommonCrawlWARCDownloaderExtractOnly(
aws: bool = False,
verbose: bool = False,
)#

Bases: nemo_curator.download.doc_builder.DocumentDownloader

Initialization

download(url: str) → str#
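
Judging by the name, this variant is for pipelines that only need the extraction step, so no download directory is configured. A minimal sketch under that assumption (the URL is a hypothetical placeholder):

```python
from nemo_curator.download.commoncrawl import CommonCrawlWARCDownloaderExtractOnly

downloader = CommonCrawlWARCDownloaderExtractOnly(aws=False, verbose=True)

# Same download(url) interface as CommonCrawlWARCDownloader above.
path = downloader.download(
    "https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-06/segments/example/warc/example.warc.gz"
)
```
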
class download.commoncrawl.CommonCrawlWARCExtractor(
algorithm: download.commoncrawl.HTMLExtractorAlgorithm | None = None,
stop_lists: dict[str, frozenset[str]] | None = None,
)#

Bases: nemo_curator.download.doc_builder.DocumentExtractor

Initialization

extract(content: str) → dict[str, str] | None#
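
A sketch of pairing this extractor with one of the HTML extraction algorithms documented below; the HTML string is a toy stand-in for a decoded WARC record:

```python
from nemo_curator.download.commoncrawl import (
    CommonCrawlWARCExtractor,
    JusTextExtractor,
)

extractor = CommonCrawlWARCExtractor(algorithm=JusTextExtractor())

html = "<html><body><p>Some page text worth keeping.</p></body></html>"
record = extractor.extract(html)  # dict of extracted fields, or None if nothing usable
if record is not None:
    print(record)
```
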
class download.commoncrawl.CommonCrawlWARCIterator(log_frequency: int = 1000)#

Bases: nemo_curator.download.doc_builder.DocumentIterator

Initialization

iterate(
file_path: str,
) → collections.abc.Iterator[tuple[dict[str, str], str]]#
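
A sketch of walking the records of a locally stored WARC file (the path is hypothetical):

```python
from nemo_curator.download.commoncrawl import CommonCrawlWARCIterator

iterator = CommonCrawlWARCIterator(log_frequency=1000)  # progress log every 1000 records

for metadata, content in iterator.iterate("/tmp/cc_warcs/example.warc.gz"):
    # Per the signature above, each item is a (metadata dict, record content) pair.
    print(metadata)
    break  # inspect the first record only
```
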
class download.commoncrawl.HTMLExtractorAlgorithm#

Bases: abc.ABC

abstractmethod extract_text(
html: str,
stop_words: frozenset[str],
language: str,
) → list[str] | None#
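
Custom algorithms subclass this ABC and implement `extract_text`. A toy sketch (the tag stripping is deliberately naive and purely illustrative):

```python
import re

from nemo_curator.download.commoncrawl import HTMLExtractorAlgorithm


class NaiveExtractor(HTMLExtractorAlgorithm):
    def extract_text(
        self,
        html: str,
        stop_words: frozenset[str],
        language: str,
    ) -> list[str] | None:
        # Strip tags crudely and keep non-empty lines as "paragraphs".
        text = re.sub(r"<[^>]+>", " ", html)
        paragraphs = [line.strip() for line in text.splitlines() if line.strip()]
        return paragraphs or None
```
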
class download.commoncrawl.JusTextExtractor(
length_low: int = 70,
length_high: int = 200,
stopwords_low: float = 0.3,
stopwords_high: float = 0.32,
max_link_density: float = 0.2,
max_heading_distance: int = 200,
no_headings: bool = False,
is_boilerplate: bool | None = None,
logger: logging.Logger | None = None,
)#

Bases: download.commoncrawl.HTMLExtractorAlgorithm

Initialization

extract_text(
html: str,
stop_words: frozenset[str],
language: str,
) → list[str] | None#
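
A sketch of instantiating the algorithm with non-default thresholds and plugging it into the WARC extractor; the values are illustrative, not tuned recommendations:

```python
from nemo_curator.download.commoncrawl import (
    CommonCrawlWARCExtractor,
    JusTextExtractor,
)

algorithm = JusTextExtractor(
    length_low=50,          # shorter paragraphs may still count as good
    length_high=150,
    stopwords_low=0.25,
    stopwords_high=0.30,
    max_link_density=0.15,  # stricter link-density cutoff than the 0.2 default
)
extractor = CommonCrawlWARCExtractor(algorithm=algorithm)
```
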
download.commoncrawl.NON_SPACED_LANGUAGES#

['THAI', 'CHINESE', 'JAPANESE', 'KOREAN']

class download.commoncrawl.ResiliparseExtractor(
required_stopword_density: float = 0.32,
main_content: bool = True,
alt_texts: bool = False,
)#

Bases: download.commoncrawl.HTMLExtractorAlgorithm

Initialization

extract_text(
html: str,
stop_words: frozenset[str],
language: str,
) → list[str] | None#
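
A sketch of the Resiliparse-based algorithm with alt texts enabled (parameter values are illustrative):

```python
from nemo_curator.download.commoncrawl import (
    CommonCrawlWARCExtractor,
    ResiliparseExtractor,
)

algorithm = ResiliparseExtractor(
    required_stopword_density=0.25,  # loosened from the 0.32 default
    main_content=True,               # restrict extraction to the main content area
    alt_texts=True,                  # also keep image alt texts
)
extractor = CommonCrawlWARCExtractor(algorithm=algorithm)
```
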
class download.commoncrawl.TrafilaturaExtractor(
required_stopword_density: float = 0.32,
min_extracted_size: int = 250,
min_extracted_comm_size: int = 1,
min_output_size: int = 1,
min_output_comm_size: int = 1,
max_tree_size: int | None = None,
min_duplcheck_size: int = 100,
max_repetitions: int = 2,
**extract_kwargs,
)#

Bases: download.commoncrawl.HTMLExtractorAlgorithm

Initialization

extract_text(
html: str,
stop_words: frozenset[str],
language: str,
) → list[str] | None#
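
A sketch of the Trafilatura-based algorithm. Extra keyword arguments are forwarded through `**extract_kwargs`; which options the underlying trafilatura call accepts is an assumption, not guaranteed by this page:

```python
from nemo_curator.download.commoncrawl import (
    CommonCrawlWARCExtractor,
    TrafilaturaExtractor,
)

algorithm = TrafilaturaExtractor(
    min_extracted_size=300,  # demand a larger minimum extraction than the 250 default
    max_repetitions=1,       # stricter duplicate-segment tolerance
)
extractor = CommonCrawlWARCExtractor(algorithm=algorithm)
```
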
download.commoncrawl.decode_html(html_bytes: bytes) → str | None#

download.commoncrawl.download_common_crawl(
output_path: str,
start_snapshot: str,
end_snapshot: str,
output_type: Literal['jsonl', 'parquet'] = 'jsonl',
algorithm: download.commoncrawl.HTMLExtractorAlgorithm | None = None,
stop_lists: dict[str, frozenset[str]] | None = None,
news: bool = False,
aws: bool = False,
raw_download_dir: str | None = None,
keep_raw_download: bool = False,
force_download: bool = False,
url_limit: int | None = None,
record_limit: int | None = None,
) → nemo_curator.datasets.DocumentDataset#
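
An end-to-end sketch: download a span of snapshots and extract text into JSONL. Snapshot identifiers follow Common Crawl's `YYYY-WW` naming; all paths and values here are illustrative:

```python
from nemo_curator.download.commoncrawl import (
    JusTextExtractor,
    download_common_crawl,
)

dataset = download_common_crawl(
    output_path="/data/cc_extracted",
    start_snapshot="2023-06",
    end_snapshot="2023-14",
    output_type="jsonl",
    algorithm=JusTextExtractor(),
    raw_download_dir="/data/cc_raw",  # where intermediate WARC files go
    keep_raw_download=False,          # delete WARCs once extracted
    url_limit=5,                      # cap WARC URLs for a quick trial run
)

# The return value is a nemo_curator DocumentDataset wrapping the extracted records.
print(dataset)
```
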
download.commoncrawl.get_all_stop_words() → frozenset[str]#

download.commoncrawl.get_stop_list_dict(
languages: list[str] | None = None,
) → dict[str, frozenset[str]]#
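
A sketch of the two stop-word helpers; that omitting `languages` returns every supported language is an assumption based on the signature:

```python
from nemo_curator.download.commoncrawl import (
    get_all_stop_words,
    get_stop_list_dict,
)

# One flat set of stop words across every supported language.
all_stop_words = get_all_stop_words()

# Per-language stop lists, keyed by language name.
stop_lists = get_stop_list_dict()
for language, words in list(stop_lists.items())[:3]:
    print(language, len(words))
```
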
download.commoncrawl.lang_detect(decoded_html: str) → str#

download.commoncrawl.try_decode_with_detected_encoding(html_bytes: bytes) → str | None#
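
A sketch combining the decoding and language-detection helpers on raw WARC payload bytes. How the two decoders relate internally is an assumption; here the detected-encoding variant is simply used as a fallback:

```python
from nemo_curator.download.commoncrawl import (
    decode_html,
    lang_detect,
    try_decode_with_detected_encoding,
)

raw_bytes = "<html><body><p>Hola, mundo.</p></body></html>".encode("utf-8")

# Try the default decode first, then fall back to encoding detection.
html = decode_html(raw_bytes) or try_decode_with_detected_encoding(raw_bytes)
if html is not None:
    print(lang_detect(html))  # language label usable for stop-list lookup
```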