download.doc_builder#

Module Contents#

Classes#

Functions#

API#

class download.doc_builder.DocumentDownloader#

Bases: abc.ABC

Initialization

abstractmethod download(url: str) -> str#
class download.doc_builder.DocumentExtractor#

Bases: abc.ABC

Initialization

abstractmethod extract(content: str) -> dict[str, str]#
class download.doc_builder.DocumentIterator#

Bases: abc.ABC

Initialization

abstractmethod iterate(
file_path: str,
) -> collections.abc.Iterator[tuple[dict[str, str], str]]#
download.doc_builder.batch_download(
urls: list[str],
downloader: download.doc_builder.DocumentDownloader,
) -> list[str]#
download.doc_builder.download_and_extract(
urls: list[str],
output_paths: list[str],
downloader: download.doc_builder.DocumentDownloader,
iterator: download.doc_builder.DocumentIterator,
extractor: download.doc_builder.DocumentExtractor,
output_format: dict,
output_type: Literal['jsonl', 'parquet'] = 'jsonl',
keep_raw_download: bool = False,
force_download: bool = False,
input_meta: str | dict | None = None,
filename_col: str = 'file_name',
record_limit: int | None = None,
) -> nemo_curator.datasets.DocumentDataset#
download.doc_builder.import_downloader(
downloader_path: str,
) -> download.doc_builder.DocumentDownloader#
download.doc_builder.import_extractor(
extractor_path: str,
) -> download.doc_builder.DocumentExtractor#
download.doc_builder.import_iterator(
iterator_path: str,
) -> download.doc_builder.DocumentIterator#