Source code for nemo_retriever.graph.file_loader_operator

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Operators for loading file paths into DataFrames."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import pandas as pd

from nemo_retriever.graph.abstract_operator import AbstractOperator
from nemo_retriever.graph.cpu_operator import CPUOperator
from nemo_retriever.graph.designer import designer_component


@designer_component(
    name="File List Loader",
    category="Document Processing",
    compute="cpu",
    description="Loads files from a list of file paths",
    category_color="#64b4ff",
)
class FileListLoaderOperator(AbstractOperator, CPUOperator):
    """Read files from disk into a DataFrame with ``path`` and ``bytes`` columns.

    The three stages are:

    * ``preprocess`` normalizes the input into a list of path strings,
    * ``process`` reads every entry that is an existing file,
    * ``postprocess`` hands the resulting frame back unchanged.
    """

    def preprocess(self, data: Any, **kwargs: Any) -> list[str]:
        """Normalize *data* into a list of path strings.

        Args:
            data: A single ``str``/``Path`` or a ``list`` of items; each
                list item is converted with ``str()``.
            **kwargs: Accepted but not used by this method.

        Returns:
            A list of path strings (one element for a single path input).

        Raises:
            TypeError: If *data* is neither a ``str``, a ``Path`` nor a ``list``.
        """
        if isinstance(data, list):
            return list(map(str, data))
        if not isinstance(data, (str, Path)):
            raise TypeError(f"data must be a file path or list of file paths, got {type(data).__name__}")
        return [str(data)]

    def process(self, data: list[str], **kwargs: Any) -> pd.DataFrame:
        """Read each listed file and collect the results in a DataFrame.

        Entries for which ``Path.is_file()`` is false are skipped without
        raising.

        Args:
            data: Path strings to read.
            **kwargs: Accepted but not used by this method.

        Returns:
            A DataFrame with columns ``path`` (the resolved path as a string)
            and ``bytes`` (the raw file contents), one row per file read. When
            nothing was read, an empty DataFrame with the same two columns.
        """
        columns = ["path", "bytes"]
        # Lazy generator: each entry is wrapped, checked and read in turn.
        candidates = (Path(entry) for entry in data)
        records = [
            {"path": str(candidate.resolve()), "bytes": candidate.read_bytes()}
            for candidate in candidates
            if candidate.is_file()
        ]
        if records:
            return pd.DataFrame(records, columns=columns)
        # No readable files: still expose the expected column schema.
        return pd.DataFrame(columns=columns)

    def postprocess(self, data: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
        """Return *data* unchanged; no post-processing is applied."""
        return data