nemo_curator.stages.text.io.reader.base
nemo_curator.stages.text.io.reader.base
Module Contents
Classes
| Name | Description |
|---|---|
| BaseReader | Common base for tabular file readers. |
API
class nemo_curator.stages.text.io.reader.base.BaseReader( fields: list[str] | None = None, read_kwargs: dict[str, typing.Any] = field(default_factory=dict), name: str = '', _generate_ids: bool = False, _assign_ids: bool = False )
Dataclass
Bases: ProcessingStage[FileGroupTask, DocumentBatch]
Common base for tabular file readers.
Subclasses must implement the read_data method.
_assign_ids
bool = False
_generate_ids
bool = False
fields
list[str] | None = None
name
str = ''
read_kwargs
dict[str, Any] = field(default_factory=dict)
nemo_curator.stages.text.io.reader.base.BaseReader.__post_init__() -> None
nemo_curator.stages.text.io.reader.base.BaseReader._assign_ids_func( filepath: str | list[str], df: pandas.DataFrame ) -> pandas.DataFrame
nemo_curator.stages.text.io.reader.base.BaseReader._generate_ids_func( filepath: str | list[str], df: pandas.DataFrame ) -> pandas.DataFrame
nemo_curator.stages.text.io.reader.base.BaseReader.inputs() -> tuple[list[str], list[str]]
nemo_curator.stages.text.io.reader.base.BaseReader.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.text.io.reader.base.BaseReader.process( task: nemo_curator.tasks.FileGroupTask ) -> nemo_curator.tasks.DocumentBatch
nemo_curator.stages.text.io.reader.base.BaseReader.ray_stage_spec() -> dict[str, typing.Any]
nemo_curator.stages.text.io.reader.base.BaseReader.read_data( file_paths: list[str], read_kwargs: dict[str, typing.Any] | None, fields: list[str] | None ) -> pandas.DataFrame | None
nemo_curator.stages.text.io.reader.base.BaseReader.setup( _: nemo_curator.backends.base.WorkerMetadata | None = None ) -> None