download.doc_builder#
Module Contents#
Classes#
Functions#
API#
- class download.doc_builder.DocumentDownloader#
Bases: abc.ABC
Initialization
- abstractmethod download(url: str) -> str#
- class download.doc_builder.DocumentExtractor#
Bases: abc.ABC
Initialization
- abstractmethod extract(content: str) -> dict[str, str]#
- class download.doc_builder.DocumentIterator#
Bases: abc.ABC
Initialization
- abstractmethod iterate(
- file_path: str,
- download.doc_builder.batch_download(
- urls: list[str],
- downloader: download.doc_builder.DocumentDownloader,
- download.doc_builder.download_and_extract(
- urls: list[str],
- output_paths: list[str],
- downloader: download.doc_builder.DocumentDownloader,
- iterator: download.doc_builder.DocumentIterator,
- extractor: download.doc_builder.DocumentExtractor,
- output_format: dict,
- output_type: Literal['jsonl', 'parquet'] = 'jsonl',
- keep_raw_download: bool = False,
- force_download: bool = False,
- input_meta: str | dict | None = None,
- filename_col: str = 'file_name',
- record_limit: int | None = None,
- download.doc_builder.import_downloader(
- downloader_path: str,
- download.doc_builder.import_extractor(
- extractor_path: str,
- download.doc_builder.import_iterator(
- iterator_path: str,