Source code for nemo_retriever.utils.input_files

from __future__ import annotations

import glob
from collections.abc import Iterable
from os import PathLike, fspath
from pathlib import Path
from typing import NoReturn

INPUT_TYPE_PATTERNS: dict[str, tuple[str, ...]] = {
    "auto": (
        "*.pdf",
        "*.docx",
        "*.pptx",
        "*.txt",
        "*.html",
        "*.jpg",
        "*.jpeg",
        "*.png",
        "*.tiff",
        "*.tif",
        "*.bmp",
        "*.svg",
        "*.mp3",
        "*.wav",
        "*.m4a",
        "*.mp4",
        "*.mov",
        "*.mkv",
    ),
    "pdf": ("*.pdf",),
    "txt": ("*.txt",),
    "html": ("*.html",),
    "doc": ("*.docx", "*.pptx"),
    "image": ("*.jpg", "*.jpeg", "*.png", "*.tiff", "*.tif", "*.bmp", "*.svg"),
    "audio": ("*.mp3", "*.wav", "*.m4a"),
    "video": ("*.mp4", "*.mov", "*.mkv"),
}
INPUT_TYPE_EXTENSIONS: dict[str, frozenset[str]] = {
    input_type: frozenset(pattern[1:].lower() for pattern in patterns if pattern.startswith("*."))
    for input_type, patterns in INPUT_TYPE_PATTERNS.items()
    if input_type != "auto"
}
AUTO_INPUT_EXTENSIONS: frozenset[str] = frozenset().union(*INPUT_TYPE_EXTENSIONS.values())
PDF_DOCUMENT_INPUT_TYPES = frozenset({"pdf", "doc"})

InputPath = str | PathLike[str]


def _is_explicit_glob_path(input_path: InputPath) -> bool:
    return glob.has_magic(fspath(input_path))


[docs] def input_type_for_path(input_path: InputPath) -> str | None: """Return the supported ingest input family for *input_path*'s extension.""" ext = Path(fspath(input_path)).suffix.lower() for input_type, extensions in INPUT_TYPE_EXTENSIONS.items(): if ext in extensions: return input_type return None
[docs] def raise_input_path_not_found(input_path: object, cause: BaseException | None = None) -> NoReturn: """Raise a consistent missing-input-path error. Parameters ---------- input_path Path, pattern, or list of paths attempted by the caller or file reader. cause Optional lower-level exception to preserve as the chained cause. Raises ------ FileNotFoundError Always raised with a product-level missing-input-path message. """ message = f"Input path does not exist: {input_path}" if cause is None: raise FileNotFoundError(message) raise FileNotFoundError(f"{message}. Reader error: {cause}") from cause
[docs] def expand_input_file_patterns(input_paths: InputPath | Iterable[InputPath]) -> list[str]: """Expand local path/glob inputs and reject missing or directory local literal paths. Empty explicit glob matches are allowed so callers can intentionally describe optional file sets. """ paths = [input_paths] if isinstance(input_paths, (str, PathLike)) else list(input_paths) expanded: list[str] = [] for input_path in paths: raw_path = fspath(input_path) pattern = str(Path(raw_path).expanduser()) matches = [match for match in glob.glob(pattern, recursive=True) if Path(match).is_file()] if matches: expanded.extend(sorted(matches)) elif _is_explicit_glob_path(pattern): expanded.append(pattern) elif not Path(pattern).exists(): raise_input_path_not_found(pattern) elif Path(pattern).is_dir(): raise IsADirectoryError( f"Input path is a directory: {pattern}. " "Pass a file path or a glob pattern such as '<dir>/**/*.pdf' or '<dir>/**/*' " "to select files inside the directory." ) else: expanded.append(pattern) return expanded
[docs] def resolve_input_patterns(input_path: Path, input_type: str) -> list[str]: path = Path(input_path) if path.is_file(): return [str(path)] if not path.is_dir(): raise FileNotFoundError(f"Path does not exist: {path}") patterns = INPUT_TYPE_PATTERNS.get(input_type, INPUT_TYPE_PATTERNS["pdf"]) return [str(path / "**" / pattern) for pattern in patterns]
[docs] def resolve_input_files(input_path: Path, input_type: str) -> list[Path]: path = Path(input_path).expanduser().resolve() if path.is_file(): return [path] if not path.exists(): return [] files: list[Path] = [] for pattern in INPUT_TYPE_PATTERNS.get(input_type, INPUT_TYPE_PATTERNS["pdf"]): files.extend(match for match in path.rglob(pattern) if match.is_file()) return sorted(set(files))