datasets.doc_dataset
Module Contents
Classes
API
- class datasets.doc_dataset.DocumentDataset(dataset_df: dask.dataframe.DataFrame)
Initialization
- classmethod from_pandas(
- data: pandas.DataFrame,
- npartitions: int | None = 1,
- chunksize: int | None = None,
- sort: bool | None = True,
- name: str | None = None,
- )
- head(n: int = 5) → datasets.doc_dataset.cudf | pandas.DataFrame
- persist() → datasets.doc_dataset.DocumentDataset
- classmethod read_custom(
- input_files: str | list[str],
- file_type: str,
- read_func_single_partition: collections.abc.Callable[[list[str], str, bool, str | dict, dict], datasets.doc_dataset.cudf | pandas.DataFrame],
- files_per_partition: int | None = None,
- backend: Literal[pandas, datasets.doc_dataset.cudf] | None = None,
- add_filename: bool | str = False,
- columns: list[str] | None = None,
- input_meta: str | dict | None = None,
- **kwargs,
- )
- classmethod read_json(
- input_files: str | list[str],
- backend: Literal[pandas, datasets.doc_dataset.cudf] = 'pandas',
- files_per_partition: int | None = None,
- blocksize: str | None = '1gb',
- add_filename: bool | str = False,
- input_meta: str | dict | None = None,
- columns: list[str] | None = None,
- **kwargs,
- )
- classmethod read_parquet(
- input_files: str | list[str],
- backend: Literal[pandas, datasets.doc_dataset.cudf] = 'pandas',
- files_per_partition: int | None = None,
- blocksize: str | None = '1gb',
- add_filename: bool | str = False,
- columns: list[str] | None = None,
- **kwargs,
- )
- classmethod read_pickle(
- input_files: str | list[str],
- backend: Literal[pandas, datasets.doc_dataset.cudf] = 'pandas',
- columns: list[str] | None = None,
- **kwargs,
- )
- repartition(*args, **kwargs) → datasets.doc_dataset.DocumentDataset
- to_json(
- output_path: str,
- write_to_filename: bool | str = False,
- keep_filename_column: bool = False,
- partition_on: str | None = None,
- compression: str | None = None,
- )
- to_pandas() → pandas.DataFrame
- to_parquet(
- output_path: str,
- write_to_filename: bool | str = False,
- keep_filename_column: bool = False,
- partition_on: str | None = None,
- )
- to_pickle(
- output_path: str,
- write_to_filename: bool | str = False,