download.commoncrawl#
Module Contents#
Classes#
Functions#
Data#
API#
- class download.commoncrawl.CommonCrawlWARCDownloader(
- download_dir: str,
- aws: bool = False,
- verbose: bool = False,
Bases:
nemo_curator.download.doc_builder.DocumentDownloader
Initialization
- download(url: str) → str#
- class download.commoncrawl.CommonCrawlWARCDownloaderExtractOnly(
- aws: bool = False,
- verbose: bool = False,
Bases:
nemo_curator.download.doc_builder.DocumentDownloader
Initialization
- download(url: str) → str#
- class download.commoncrawl.CommonCrawlWARCExtractor(
- algorithm: download.commoncrawl.HTMLExtractorAlgorithm | None = None,
- stop_lists: dict[str, frozenset[str]] | None = None,
Bases:
nemo_curator.download.doc_builder.DocumentExtractor
Initialization
- extract(content: str) → dict[str, str] | None#
- class download.commoncrawl.CommonCrawlWARCIterator(log_frequency: int = 1000)#
Bases:
nemo_curator.download.doc_builder.DocumentIterator
Initialization
- iterate(
- file_path: str,
- class download.commoncrawl.HTMLExtractorAlgorithm#
Bases:
abc.ABC
- abstractmethod extract_text(
- html: str,
- stop_words: frozenset[str],
- language: str,
- class download.commoncrawl.JusTextExtractor(
- length_low: int = 70,
- length_high: int = 200,
- stopwords_low: float = 0.3,
- stopwords_high: float = 0.32,
- max_link_density: float = 0.2,
- max_heading_distance: int = 200,
- no_headings: bool = False,
- is_boilerplate: bool | None = None,
- logger: logging.Logger | None = None,
Bases:
download.commoncrawl.HTMLExtractorAlgorithm
Initialization
- extract_text(
- html: str,
- stop_words: frozenset[str],
- language: str,
- download.commoncrawl.NON_SPACED_LANGUAGES#
['THAI', 'CHINESE', 'JAPANESE', 'KOREAN']
- class download.commoncrawl.ResiliparseExtractor(
- required_stopword_density: float = 0.32,
- main_content: bool = True,
- alt_texts: bool = False,
Bases:
download.commoncrawl.HTMLExtractorAlgorithm
Initialization
- extract_text(
- html: str,
- stop_words: frozenset[str],
- language: str,
- class download.commoncrawl.TrafilaturaExtractor(
- required_stopword_density: float = 0.32,
- min_extracted_size: int = 250,
- min_extracted_comm_size: int = 1,
- min_output_size: int = 1,
- min_output_comm_size: int = 1,
- max_tree_size: int | None = None,
- min_duplcheck_size: int = 100,
- max_repetitions: int = 2,
- **extract_kwargs,
Bases:
download.commoncrawl.HTMLExtractorAlgorithm
Initialization
- extract_text(
- html: str,
- stop_words: frozenset[str],
- language: str,
- download.commoncrawl.decode_html(html_bytes: bytes) → str | None#
- download.commoncrawl.download_common_crawl(
- output_path: str,
- start_snapshot: str,
- end_snapshot: str,
- output_type: Literal['jsonl', 'parquet'] = 'jsonl',
- algorithm: download.commoncrawl.HTMLExtractorAlgorithm | None = None,
- stop_lists: dict[str, frozenset[str]] | None = None,
- news: bool = False,
- aws: bool = False,
- raw_download_dir: str | None = None,
- keep_raw_download: bool = False,
- force_download: bool = False,
- url_limit: int | None = None,
- record_limit: int | None = None,
- download.commoncrawl.get_all_stop_words() → frozenset[str]#
- download.commoncrawl.get_stop_list_dict(
- languages: list[str] | None = None,
- download.commoncrawl.lang_detect(decoded_html: str) → str#
- download.commoncrawl.try_decode_with_detected_encoding(html_bytes: bytes) → str | None#