nemo_curator.stages.interleaved.io.writers.base

View as Markdown | Open in Claude

Module Contents

Classes

| Name | Description |
| --- | --- |
| `BaseInterleavedWriter` | Base class for interleaved writers. |

API

class nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter(
path: str,
file_extension: str,
write_kwargs: dict[str, typing.Any] = field(default_factory=dict),
materialize_on_write: bool = True,
name: str = 'base_interleaved_writer',
mode: typing.Literal['ignore', 'overwrite', 'append', 'error'] = 'ignore',
append_mode_implemented: bool = False
)
Dataclass · Abstract

Bases: ProcessingStage[InterleavedBatch, FileGroupTask]

Base class for interleaved writers.

Handles filesystem setup, deterministic file naming, optional binary materialization, and `process()` orchestration. Subclasses implement `_write_dataframe` for format-specific output.

append_mode_implemented: bool = False
file_extension: str
materialize_on_write: bool = True
mode: Literal['ignore', 'overwrite', 'append', 'error'] = 'ignore'
name: str = 'base_interleaved_writer'
path: str
write_kwargs: dict[str, Any] = field(default_factory=dict)
nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter.__post_init__() -> None
nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter._materialize_dataframe(
task: nemo_curator.tasks.InterleavedBatch
) -> pandas.DataFrame
nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter._write_dataframe(
df: pandas.DataFrame,
file_path: str,
write_kwargs: dict[str, typing.Any]
) -> None
abstract

Format-specific DataFrame writer. Subclasses implement this.

nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter.inputs() -> tuple[list[str], list[str]]
nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter.process(
task: nemo_curator.tasks.InterleavedBatch
) -> nemo_curator.tasks.FileGroupTask
nemo_curator.stages.interleaved.io.writers.base.BaseInterleavedWriter.write_data(
task: nemo_curator.tasks.InterleavedBatch,
file_path: str
) -> None