download.wikipedia#
Module Contents#
Classes#
Functions#
Data#
API#
- download.wikipedia.CAT_ALIASES#
None
- download.wikipedia.MEDIA_ALIASES#
None
- class download.wikipedia.WikipediaDownloader(download_dir: str, verbose: bool = False)#
Bases:
nemo_curator.download.doc_builder.DocumentDownloader
Initialization
- download(url: str) -> str#
- class download.wikipedia.WikipediaExtractor(language: str = 'en', parser=mwparserfromhell)#
Bases:
nemo_curator.download.doc_builder.DocumentExtractor
Initialization
- extract(content) -> dict[str, str]#
- class download.wikipedia.WikipediaIterator(language: str = 'en', log_frequency: int = 1000)#
Bases:
nemo_curator.download.doc_builder.DocumentIterator
Initialization
- iterate(
- file_path: str,
- download.wikipedia.download_wikipedia(
- output_path: str,
- language: str = 'en',
- dump_date: str | None = None,
- output_type: Literal['jsonl', 'parquet'] = 'jsonl',
- raw_download_dir: str | None = None,
- keep_raw_download: bool = False,
- force_download: bool = False,
- url_limit: int | None = None,
- record_limit: int | None = None,