Source code for nemo_retriever.utils.convert.to_pdf

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Convert DOCX/PPTX files to PDF bytes via LibreOffice headless."""

from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import traceback
from typing import Any, Dict, List, Optional

import pandas as pd

from nemo_retriever.graph.abstract_operator import AbstractOperator
from nemo_retriever.graph.cpu_operator import CPUOperator
from nemo_retriever.graph.designer import designer_component
from nemo_retriever.graph.operator_archetype import ArchetypeOperator

# File extensions this module recognises: ".pdf" content is passed through
# unchanged, while ".docx" and ".pptx" are converted with LibreOffice.
# Callers lower-case an extension before testing membership.
SUPPORTED_EXTENSIONS = frozenset({".pdf", ".docx", ".pptx"})


def _error_record(
    *,
    source_path: Optional[str],
    stage: str,
    exc: BaseException,
) -> Dict[str, Any]:
    """Build the placeholder row emitted when processing a file fails.

    The returned mapping has an empty ``bytes`` payload, ``path`` set to
    *source_path*, and a ``metadata`` mapping that repeats the source path
    and describes the failure under ``"error"`` (stage name, exception
    class name, message and formatted traceback).

    All arguments are keyword-only.
    """
    stage_name = str(stage)
    exc_type_name = exc.__class__.__name__
    exc_message = str(exc)
    # Render the full traceback to one string so the record stays plain data.
    tb_lines = traceback.format_exception(type(exc), exc, exc.__traceback__)
    tb_text = "".join(tb_lines)

    error_info: Dict[str, Any] = {
        "stage": stage_name,
        "type": exc_type_name,
        "message": exc_message,
        "traceback": tb_text,
    }
    metadata: Dict[str, Any] = {
        "source_path": source_path,
        "error": error_info,
    }
    return {"bytes": b"", "path": source_path, "metadata": metadata}



[docs]
def convert_to_pdf_bytes(
    file_bytes: bytes,
    extension: str,
    *,
    timeout: Optional[float] = None,
) -> bytes:
    """Convert file bytes to PDF bytes.

    If *extension* is ``".pdf"``, return *file_bytes* unchanged.
    For ``".docx"`` / ``".pptx"``, write to a temp dir, invoke
    ``libreoffice --headless --convert-to pdf``, and return the
    resulting PDF bytes.

    Parameters
    ----------
    file_bytes
        Raw content of the source document.
    extension
        Source file extension including the leading dot (e.g. ``".docx"``).
        Matched case-insensitively.
    timeout
        Optional limit, in seconds, for the LibreOffice subprocess.
        ``None`` (the default) waits indefinitely.

    Returns
    -------
    bytes
        The PDF content.

    Raises
    ------
    ValueError
        If *extension* is not one of the supported extensions.
    FileNotFoundError
        If the ``libreoffice`` binary is not on ``$PATH``.
    subprocess.CalledProcessError
        If LibreOffice conversion fails.
    subprocess.TimeoutExpired
        If *timeout* is given and LibreOffice does not finish in time.
    RuntimeError
        If the expected PDF output file is missing after conversion.
    """
    ext = extension.lower()
    if ext == ".pdf":
        return file_bytes

    if ext not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported extension: {extension!r}")

    if shutil.which("libreoffice") is None:
        raise FileNotFoundError(
            "LibreOffice is required to convert DOCX/PPTX files to PDF but was not found on $PATH. "
            "Please install LibreOffice (e.g. `apt-get install libreoffice`) and try again."
        )

    with tempfile.TemporaryDirectory() as tmp_dir:
        # *ext* keeps its leading dot, so the file is named e.g. "input.docx".
        input_path = os.path.join(tmp_dir, f"input{ext}")
        with open(input_path, "wb") as f:
            f.write(file_bytes)

        command = [
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            input_path,
            "--outdir",
            tmp_dir,
        ]

        # errors="replace" keeps undecodable LibreOffice output from raising
        # UnicodeDecodeError and masking the real conversion result.
        completed = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=timeout,
        )

        # The output is looked up under the input's base name with a ".pdf" suffix.
        pdf_path = os.path.join(tmp_dir, "input.pdf")
        if not os.path.exists(pdf_path):
            # The process exited with status 0 but wrote no PDF; surface
            # whatever it printed so the failure can be diagnosed.
            message = f"LibreOffice conversion produced no output for {extension} file"
            details = (completed.stderr or completed.stdout or "").strip()
            if details:
                message = f"{message}: {details}"
            raise RuntimeError(message)

        with open(pdf_path, "rb") as f:
            return f.read()




[docs]
def convert_batch_to_pdf(batch_df: Any) -> pd.DataFrame:
    """Convert a batch of files to PDF, passing PDFs through unchanged.

    Expects a :class:`pandas.DataFrame` with at least ``bytes`` and ``path``
    columns (the same schema produced by ``ray.data.read_binary_files``).
    A plain ``list`` is also accepted and is treated as a list of paths.

    Per-row handling:

    * Unsupported extension: the row is passed through untouched.
    * ``.pdf`` (or no path at all) with a non-empty payload: passed through.
    * Anything else: the payload is sent to :func:`convert_to_pdf_bytes`.
      If that fails, an error record (empty ``bytes`` plus
      ``metadata["error"]``) is emitted for the row instead of aborting
      the batch.

    Columns other than ``bytes`` and ``path`` are copied onto the output row.

    Returns
    -------
    pandas.DataFrame
        One output row per input row, in input order.

    Raises
    ------
    NotImplementedError
        If *batch_df* is neither a ``DataFrame`` nor a ``list``.
    FileNotFoundError
        If LibreOffice is not installed; this is never converted into an
        error record because every later row would fail the same way.
    """
    if isinstance(batch_df, list):
        # A bare list is interpreted as a list of paths.  No "bytes" column is
        # created, so every row is processed with the empty default payload.
        batch_df = pd.DataFrame({"path": batch_df})

    if not isinstance(batch_df, pd.DataFrame):
        raise NotImplementedError("convert_batch_to_pdf currently only supports pandas.DataFrame input.")

    explicit_cols = frozenset(("bytes", "path"))

    out_rows: List[Dict[str, Any]] = []
    for _, row in batch_df.iterrows():
        file_path = row.get("path") or ""
        file_bytes = row.get("bytes", b"")

        # Every other column is carried over to the output row as-is.
        extra = {k: v for k, v in row.to_dict().items() if k not in explicit_cols}

        # Rows without a path are assumed to already be PDFs.
        ext = os.path.splitext(file_path)[1].lower() if file_path else ".pdf"

        # A payload without a length (None, NaN, ...) must not crash the whole
        # batch here; treat it as "no payload" so it reaches the type check
        # below and becomes an error record for this row only.
        try:
            has_payload = len(file_bytes) > 0
        except TypeError:
            has_payload = False

        if ext not in SUPPORTED_EXTENSIONS or (ext == ".pdf" and has_payload):
            out_rows.append({"bytes": file_bytes, "path": file_path, **extra})
            continue

        try:
            if not isinstance(file_bytes, (bytes, bytearray, memoryview)):
                raise ValueError(f"Unsupported bytes payload type: {type(file_bytes)!r}")
            pdf_bytes = convert_to_pdf_bytes(bytes(file_bytes), ext)
        except FileNotFoundError:
            raise  # LibreOffice not installed — fail fast, don't swallow.
        except Exception as e:
            # Only ordinary errors become records; KeyboardInterrupt and
            # SystemExit propagate so cancellation and shutdown still work.
            err = _error_record(
                source_path=str(file_path) if file_path else None,
                stage="convert_to_pdf",
                exc=e,
            )
            # NOTE(review): an extra column named "metadata" would replace the
            # error details here — confirm whether that is intended.
            err.update(extra)
            out_rows.append(err)
        else:
            out_rows.append({"bytes": pdf_bytes, "path": file_path, **extra})

    return pd.DataFrame(out_rows)




[docs]
@designer_component(
    name="Doc-to-PDF Converter",
    category="Document Processing",
    compute="cpu",
    description="Converts document formats (DOCX, PPTX, etc.) to PDF",
    category_color="#64b4ff",
)
class DocToPdfConversionCPUActor(AbstractOperator, CPUOperator):
    """Ray Data actor that converts DOCX/PPTX batches to PDF.

    Used with ``ray.data.Dataset.map_batches`` in the same style as
    ``PDFSplitActor``.

    Only :meth:`process` does real work; :meth:`preprocess` and
    :meth:`postprocess` return their input unchanged.
    """

    def __init__(self) -> None:
        super().__init__()

    def preprocess(self, data: Any, **kwargs: Any) -> Any:
        """Return *data* unchanged; *kwargs* are accepted but ignored."""
        return data

    def process(self, data: Any, **kwargs: Any) -> Any:
        """Convert the batch with ``convert_batch_to_pdf``; *kwargs* are ignored."""
        return convert_batch_to_pdf(data)

    def postprocess(self, data: Any, **kwargs: Any) -> Any:
        """Return *data* unchanged; *kwargs* are accepted but ignored."""
        return data

    def __call__(self, batch_df: Any) -> Any:
        """Handle one batch by delegating to the inherited ``run`` method.

        ``run`` is not defined in this class; presumably it chains
        preprocess, process and postprocess — confirm in the base classes.
        """
        return self.run(batch_df)




[docs]
class DocToPdfConversionActor(ArchetypeOperator):
    """Archetype operator for DOCX/PPTX-to-PDF conversion.

    Declares :class:`DocToPdfConversionCPUActor` as its CPU variant.  How the
    variant is selected and invoked is defined by ``ArchetypeOperator``,
    which is not shown in this module.
    """

    # CPU implementation associated with this archetype.
    _cpu_variant_class = DocToPdfConversionCPUActor

    def __init__(self) -> None:
        # No actor-specific state; defer entirely to the base initializer.
        super().__init__()