download.wikipedia#

Module Contents#

Classes#

Functions#

Data#

API#

download.wikipedia.CAT_ALIASES#

None

download.wikipedia.MEDIA_ALIASES#

None

class download.wikipedia.WikipediaDownloader(download_dir: str, verbose: bool = False)#

Bases: nemo_curator.download.doc_builder.DocumentDownloader

Initialization

download(url: str) -> str#
class download.wikipedia.WikipediaExtractor(language: str = 'en', parser=mwparserfromhell)#

Bases: nemo_curator.download.doc_builder.DocumentExtractor

Initialization

extract(content) -> dict[str, str]#
class download.wikipedia.WikipediaIterator(language: str = 'en', log_frequency: int = 1000)#

Bases: nemo_curator.download.doc_builder.DocumentIterator

Initialization

iterate(
file_path: str,
) -> collections.abc.Iterator[tuple[dict[str, str], str]]#
download.wikipedia.download_wikipedia(
output_path: str,
language: str = 'en',
dump_date: str | None = None,
output_type: Literal['jsonl', 'parquet'] = 'jsonl',
raw_download_dir: str | None = None,
keep_raw_download: bool = False,
force_download: bool = False,
url_limit: int | None = None,
record_limit: int | None = None,
) -> nemo_curator.datasets.DocumentDataset#