datasets.doc_dataset#

Module Contents#

Classes#

API#

class datasets.doc_dataset.DocumentDataset(dataset_df: dask.dataframe.DataFrame)#

Initialization

classmethod from_pandas(
data: pandas.DataFrame,
npartitions: int | None = 1,
chunksize: int | None = None,
sort: bool | None = True,
name: str | None = None,
) → datasets.doc_dataset.DocumentDataset#
head(n: int = 5) → cudf.DataFrame | pandas.DataFrame#
persist() → datasets.doc_dataset.DocumentDataset#
classmethod read_custom(
input_files: str | list[str],
file_type: str,
read_func_single_partition: collections.abc.Callable[[list[str], str, bool, str | dict, dict], cudf.DataFrame | pandas.DataFrame],
files_per_partition: int | None = None,
backend: Literal['pandas', 'cudf'] | None = None,
add_filename: bool | str = False,
columns: list[str] | None = None,
input_meta: str | dict | None = None,
**kwargs,
) → datasets.doc_dataset.DocumentDataset#
classmethod read_json(
input_files: str | list[str],
backend: Literal['pandas', 'cudf'] = 'pandas',
files_per_partition: int | None = None,
blocksize: str | None = '1gb',
add_filename: bool | str = False,
input_meta: str | dict | None = None,
columns: list[str] | None = None,
**kwargs,
) → datasets.doc_dataset.DocumentDataset#
classmethod read_parquet(
input_files: str | list[str],
backend: Literal['pandas', 'cudf'] = 'pandas',
files_per_partition: int | None = None,
blocksize: str | None = '1gb',
add_filename: bool | str = False,
columns: list[str] | None = None,
**kwargs,
) → datasets.doc_dataset.DocumentDataset#
classmethod read_pickle(
input_files: str | list[str],
backend: Literal['pandas', 'cudf'] = 'pandas',
columns: list[str] | None = None,
**kwargs,
) → datasets.doc_dataset.DocumentDataset#
repartition(*args, **kwargs) → datasets.doc_dataset.DocumentDataset#
to_json(
output_path: str,
write_to_filename: bool | str = False,
keep_filename_column: bool = False,
partition_on: str | None = None,
compression: str | None = None,
) → None#
to_pandas() → pandas.DataFrame#
to_parquet(
output_path: str,
write_to_filename: bool | str = False,
keep_filename_column: bool = False,
partition_on: str | None = None,
) → None#
to_pickle(
output_path: str,
write_to_filename: bool | str = False,
) → None#