nemo_curator.stages.text.io.reader.base

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| BaseReader | Common base for tabular file readers. |

API

class nemo_curator.stages.text.io.reader.base.BaseReader(
fields: list[str] | None = None,
read_kwargs: dict[str, typing.Any] = dict(),
name: str = '',
_generate_ids: bool = False,
_assign_ids: bool = False
)
Dataclass

Bases: ProcessingStage[FileGroupTask, DocumentBatch]

Common base for tabular file readers.

Subclasses must implement the read_data method.

_assign_ids: bool = False
_generate_ids: bool = False
fields: list[str] | None = None
name: str = ''
read_kwargs: dict[str, Any] = field(default_factory=dict)
nemo_curator.stages.text.io.reader.base.BaseReader.__post_init__() -> None
nemo_curator.stages.text.io.reader.base.BaseReader._assign_ids_func(
filepath: str | list[str],
df: pandas.DataFrame
) -> pandas.DataFrame
nemo_curator.stages.text.io.reader.base.BaseReader._generate_ids_func(
filepath: str | list[str],
df: pandas.DataFrame
) -> pandas.DataFrame
nemo_curator.stages.text.io.reader.base.BaseReader.inputs() -> tuple[list[str], list[str]]
nemo_curator.stages.text.io.reader.base.BaseReader.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.text.io.reader.base.BaseReader.process(
task: nemo_curator.tasks.FileGroupTask
) -> nemo_curator.tasks.DocumentBatch
nemo_curator.stages.text.io.reader.base.BaseReader.ray_stage_spec() -> dict[str, typing.Any]
nemo_curator.stages.text.io.reader.base.BaseReader.read_data(
file_paths: list[str],
read_kwargs: dict[str, typing.Any] | None,
fields: list[str] | None
) -> pandas.DataFrame | None
nemo_curator.stages.text.io.reader.base.BaseReader.setup(
_: nemo_curator.backends.base.WorkerMetadata | None = None
) -> None