nemo_curator.stages.text.download.common_crawl.url_generation

View as Markdown

Module Contents

Classes

API

class nemo_curator.stages.text.download.common_crawl.url_generation.BaseCommonCrawlUrlGenerator(
start_snapshot_str: str,
end_snapshot_str: str,
data_prefix: str = 'https://data.commoncrawl.org',
limit: int | None = None
)
Dataclass, Abstract

Bases: URLGenerator

Get URLs for Common Crawl data. Each concrete implementation must implement _parse_datetime_from_snapshot_string and generate_path_urls

data_prefix
str = 'https://data.commoncrawl.org'
end_snapshot_str
str
limit
int | None = None
start_snapshot_str
str
nemo_curator.stages.text.download.common_crawl.url_generation.BaseCommonCrawlUrlGenerator.__post_init__()
nemo_curator.stages.text.download.common_crawl.url_generation.BaseCommonCrawlUrlGenerator._parse_datetime_from_snapshot_string(
snapshot_str: str,
for_start: bool
) -> datetime.datetime
abstract

Parses a snapshot string (YYYY-WW or YYYY-MM) into a datetime object.

nemo_curator.stages.text.download.common_crawl.url_generation.BaseCommonCrawlUrlGenerator._start_end_dates() -> tuple[datetime.date, datetime.date]

Parses the start and end snapshot strings into date objects. For ‘news’ (YYYY-MM), the day is set to 1 for start_date, and the last day of the month for end_date to ensure the full month is covered.

nemo_curator.stages.text.download.common_crawl.url_generation.BaseCommonCrawlUrlGenerator.generate_data_urls(
path_urls: str | list[str] | None = None
) -> list[str]

Fetches all relevant warc.paths.gz files, decompresses them, and returns a list of all individual WARC file URLs.

nemo_curator.stages.text.download.common_crawl.url_generation.BaseCommonCrawlUrlGenerator.generate_path_urls() -> list[str]
abstract

Generates the list of URLs pointing to warc.paths.gz files.

nemo_curator.stages.text.download.common_crawl.url_generation.BaseCommonCrawlUrlGenerator.generate_urls() -> list[str]

Process the task and return a list of WARC URLs.

class nemo_curator.stages.text.download.common_crawl.url_generation.MainCommonCrawlUrlGenerator(
start_snapshot_str: str,
end_snapshot_str: str,
data_prefix: str = 'https://data.commoncrawl.org',
limit: int | None = None,
index_prefix: str = 'https://index.commoncrawl.org'
)
Dataclass

Bases: BaseCommonCrawlUrlGenerator

_snapshot_index
list[dict]
index_prefix
str = 'https://index.commoncrawl.org'
nemo_curator.stages.text.download.common_crawl.url_generation.MainCommonCrawlUrlGenerator._parse_datetime_from_snapshot_string(
snapshot_str: str,
for_start: bool
) -> datetime.datetime
nemo_curator.stages.text.download.common_crawl.url_generation.MainCommonCrawlUrlGenerator.generate_path_urls() -> list[str]
class nemo_curator.stages.text.download.common_crawl.url_generation.NewsCommonCrawlUrlGenerator(
start_snapshot_str: str,
end_snapshot_str: str,
data_prefix: str = 'https://data.commoncrawl.org',
limit: int | None = None
)
Dataclass

Bases: BaseCommonCrawlUrlGenerator

nemo_curator.stages.text.download.common_crawl.url_generation.NewsCommonCrawlUrlGenerator._parse_datetime_from_snapshot_string(
snapshot_str: str,
for_start: bool
) -> datetime.datetime
nemo_curator.stages.text.download.common_crawl.url_generation.NewsCommonCrawlUrlGenerator.generate_path_urls() -> list[str]