download.arxiv

Module Contents

Classes

Functions

API

class download.arxiv.ArxivDownloader(download_dir: str, verbose: bool = False)

Bases: nemo_curator.download.doc_builder.DocumentDownloader

Initialization

download(tarfile: str) -> str
class download.arxiv.ArxivExtractor

Bases: nemo_curator.download.doc_builder.DocumentExtractor

Initialization

extract(content: list[str]) -> dict[str, str] | None
class download.arxiv.ArxivIterator(log_frequency: int = 1000)

Bases: nemo_curator.download.doc_builder.DocumentIterator

Initialization

iterate(
file_path: str,
) -> collections.abc.Iterator[tuple[dict[str, str], list[str]]]
download.arxiv.download_arxiv(
output_path: str,
output_type: Literal['jsonl', 'parquet'] = 'jsonl',
raw_download_dir: str | None = None,
keep_raw_download: bool = False,
force_download: bool = False,
url_limit: int | None = None,
record_limit: int | None = None,
) -> nemo_curator.datasets.DocumentDataset