nemo_curator.stages.interleaved.io.reader

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| WebdatasetReader | Composite stage for reading WebDataset shards. |

API

class nemo_curator.stages.interleaved.io.reader.WebdatasetReader(
file_paths: str | list[str],
files_per_partition: int | None = None,
blocksize: int | str | None = None,
max_batch_bytes: int | None = None,
read_kwargs: dict[str, typing.Any] = dict(),
materialize_on_read: bool = False,
file_extensions: list[str] = (lambda: list(DEFAULT_WEBDA...,
json_extensions: list[str] = (lambda: list(DEFAULT_JSON_...,
image_extensions: list[str] = (lambda: list(DEFAULT_IMAGE...,
source_id_field: str = '',
sample_id_field: str | None = None,
texts_field: str = 'texts',
images_field: str = 'images',
image_member_field: str | None = None,
fields: tuple[str, ...] | None = None,
per_image_fields: tuple[str, ...] = (),
per_text_fields: tuple[str, ...] = (),
name: str = 'webdataset_reader'
)
Dataclass

Bases: CompositeStage[_EmptyTask, InterleavedBatch]

Composite stage for reading WebDataset shards.

blocksize
int | str | None = None
fields
tuple[str, ...] | None = None
file_extensions
list[str]
file_paths
str | list[str]
files_per_partition
int | None = None
image_extensions
list[str]
image_member_field
str | None = None
images_field
str = 'images'
json_extensions
list[str]
materialize_on_read
bool = False
max_batch_bytes
int | None = None
name
str = 'webdataset_reader'
per_image_fields
tuple[str, ...] = ()
per_text_fields
tuple[str, ...] = ()
read_kwargs
dict[str, Any] = field(default_factory=dict)
sample_id_field
str | None = None
source_id_field
str = ''
texts_field
str = 'texts'
nemo_curator.stages.interleaved.io.reader.WebdatasetReader.__post_init__()
nemo_curator.stages.interleaved.io.reader.WebdatasetReader.decompose() -> list