# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
from typing import List, Any
from typing import Optional
from typing import Tuple
import cv2
import numpy as np
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from numpy import dtype
from numpy import ndarray
from nv_ingest_api.internal.primitives.tracing.tagging import traceable_func
from nv_ingest_api.util.image_processing.clustering import (
group_bounding_boxes,
combine_groups_into_bboxes,
remove_superset_bboxes,
)
from nv_ingest_api.util.image_processing.transforms import pad_image, numpy_to_base64, crop_image, scale_numpy_image
from nv_ingest_api.util.metadata.aggregators import Base64Image
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YOLOX_PAGE_IMAGE_FORMAT
logger = logging.getLogger(__name__)
def _compute_render_scale_to_fit(
    page: pdfium.PdfPage,
    target_wh: Tuple[int, int],
    rotation: int = 0,
) -> float:
    """
    Return the render scale at which the rotated page fits inside a pixel box.

    The value is the usual fit-to-box ratio, ``min(box_w / page_w, box_h / page_h)``,
    with a lower floor of ``1e-3``.

    Parameters
    ----------
    page : pdfium.PdfPage
        Page whose ``get_width()`` / ``get_height()`` are used as the source size.
    target_wh : Tuple[int, int]
        Bounding box as (width, height) in pixels.
    rotation : int, optional
        Rotation in degrees (0, 90, 180, 270). Defaults to 0.

    Returns
    -------
    float
        Scale factor for rendering. ``1.0`` is returned when the target box or
        the page reports a non-positive width or height.
    """
    box_w, box_h = target_wh
    if box_w <= 0 or box_h <= 0:
        # Nothing sensible to fit into; fall back to the neutral scale.
        return 1.0

    dims = (float(page.get_width()), float(page.get_height()))
    if any(side <= 0.0 for side in dims):
        return 1.0

    # A rotation that is not a multiple of 180 degrees exchanges the page's
    # width and height as far as the fit computation is concerned.
    quarter_turned = (rotation % 180) != 0
    fit_w, fit_h = (dims[1], dims[0]) if quarter_turned else dims

    ratio = min(box_w / fit_w, box_h / fit_h)
    return max(ratio, 1e-3)
# Maps PDFium's raw page-object type constants to short string labels.
PDFIUM_PAGEOBJ_MAPPING = dict(
    (
        (pdfium_c.FPDF_PAGEOBJ_TEXT, "TEXT"),
        (pdfium_c.FPDF_PAGEOBJ_PATH, "PATH"),
        (pdfium_c.FPDF_PAGEOBJ_IMAGE, "IMAGE"),
        (pdfium_c.FPDF_PAGEOBJ_SHADING, "SHADING"),
        (pdfium_c.FPDF_PAGEOBJ_FORM, "FORM"),
    )
)
[docs]
def convert_bitmap_to_corrected_numpy(bitmap: pdfium.PdfBitmap) -> np.ndarray:
    """
    Copy a PdfBitmap into a NumPy array, reordering BGR-family channels to RGB order.

    Parameters
    ----------
    bitmap : pdfium.PdfBitmap
        Bitmap rendered from a PDF page (or extracted from an image object).

    Returns
    -------
    np.ndarray
        A copy of the bitmap's pixel data. Bitmaps whose mode is ``"BGR"``,
        ``"BGRA"`` or ``"BGRX"`` have their channel order swapped; every other
        mode is returned as copied, without conversion.
    """
    # `.copy()` yields an array the in-place conversion below can overwrite.
    pixels = bitmap.to_numpy().copy()
    channel_layout = bitmap.mode

    # The channel swap is done here with OpenCV, writing back into the same
    # array, rather than through pdfium's rev_byteorder flag: that flag takes a
    # non-thread-safe path in CFX_AggDeviceDriver::GetDIBits() which SIGTRAPs
    # under concurrent rendering.
    if channel_layout == "BGR":
        cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB, dst=pixels)
    elif channel_layout in ("BGRA", "BGRX"):
        cv2.cvtColor(pixels, cv2.COLOR_BGRA2RGBA, dst=pixels)

    return pixels
[docs]
def pdfium_try_get_bitmap_as_numpy(image_obj) -> np.ndarray:
    """
    Extract the bitmap of a PdfImage object as a NumPy array.

    A rendered bitmap (``render=True``) is requested first. If that request
    raises ``PdfiumError`` or yields ``None``, the raw bitmap (``render=False``)
    is requested instead.

    Parameters
    ----------
    image_obj : PdfImage
        The image object to pull the bitmap from.

    Returns
    -------
    np.ndarray
        The bitmap converted by ``convert_bitmap_to_corrected_numpy``.

    Raises
    ------
    PdfiumError
        If the fallback (``render=False``) request fails.
    ValueError
        If the fallback request returns ``None``.

    Notes
    -----
    Failures along the way are logged at debug level.
    """
    # Preferred path: let pdfium render the image.
    try:
        rendered = image_obj.get_bitmap(render=True)
    except pdfium.PdfiumError as e:
        logger.debug(f"Failed to get rendered bitmap: {e}")
        rendered = None

    if rendered is not None:
        return convert_bitmap_to_corrected_numpy(rendered)

    # Fallback path: take the raw bitmap. A failure here is final, so it is
    # logged and propagated to the caller.
    try:
        raw = image_obj.get_bitmap(render=False)
    except pdfium.PdfiumError as e:
        logger.debug(f"Failed to get raw bitmap: {e}")
        raise

    if raw is None:
        logger.debug("Failed to obtain bitmap from the image object after both attempts.")
        raise ValueError("Failed to retrieve bitmap from the PdfImage object.")

    return convert_bitmap_to_corrected_numpy(raw)
[docs]
@traceable_func(trace_name="pdf_extraction::pdfium_pages_to_numpy")
def pdfium_pages_to_numpy(
    pages: List[pdfium.PdfPage],
    render_dpi: int = 300,
    scale_tuple: Optional[Tuple[int, int]] = None,
    padding_tuple: Optional[Tuple[int, int]] = None,
    rotation: int = 0,
) -> tuple[list[ndarray | ndarray[Any, dtype[Any]]], list[tuple[int, int]]]:
    """
    Render a list of PdfPage objects into NumPy image arrays.

    Each page is rendered to a bitmap and converted straight to a NumPy array.
    When ``scale_tuple`` is given, the page is rendered at the smaller of the
    DPI-derived scale and the scale that fits the page inside ``scale_tuple``,
    so the page is never rendered above ``render_dpi``. Padding is applied
    last, if requested.

    Parameters
    ----------
    pages : List[pdfium.PdfPage]
        Pages to render.
    render_dpi : int, optional
        Rendering resolution in dots per inch. Must be between 50 and 1200.
        Defaults to 300.
    scale_tuple : Optional[Tuple[int, int]], optional
        Maximum (width, height) in pixels for the rendered image. Defaults to None.
    padding_tuple : Optional[Tuple[int, int]], optional
        Target (width, height) to pad the image to. Defaults to None.
    rotation : int, optional
        Page rotation in degrees (0, 90, 180, 270). Defaults to 0.

    Returns
    -------
    tuple
        A tuple containing:
            - A list of NumPy arrays, one per page. Each array is an independent
              copy of the rendered image data.
            - A list of (offset_width, offset_height) tuples as reported by
              ``pad_image``; ``(0, 0)`` for every page when no padding is requested.

    Raises
    ------
    ValueError
        If ``render_dpi`` is outside the allowed range (50-1200).
    PdfiumError
        If there is an issue rendering a page.
    """
    if not (50 <= render_dpi <= 1200):
        raise ValueError("render_dpi must be between 50 and 1200.")

    images = []
    padding_offsets = []
    base_scale = render_dpi / 72  # 72 DPI is the base DPI in PDFium

    for page in pages:
        # Render directly at the fitted scale when a bound is given, so no
        # oversized intermediate bitmap is allocated.
        render_scale = base_scale
        if scale_tuple:
            render_scale = min(base_scale, _compute_render_scale_to_fit(page, scale_tuple, rotation))

        page_bitmap = page.render(scale=render_scale, rotation=rotation)
        img_arr = convert_bitmap_to_corrected_numpy(page_bitmap)

        # Safety net for rounding: the rendered bitmap can exceed the bound by
        # a pixel, in which case it is scaled down to fit.
        if scale_tuple and (img_arr.shape[1] > scale_tuple[0] or img_arr.shape[0] > scale_tuple[1]):
            img_arr = scale_numpy_image(img_arr, scale_tuple)

        if padding_tuple:
            img_arr, (pad_width, pad_height) = pad_image(
                img_arr, target_width=padding_tuple[0], target_height=padding_tuple[1]
            )
            padding_offsets.append((pad_width, pad_height))
        else:
            padding_offsets.append((0, 0))

        images.append(img_arr)

    return images, padding_offsets
[docs]
def convert_pdfium_position(pos, page_width, page_height):
    """
    Convert a PDFium bounding box (which typically has an origin at the bottom-left)
    to a more standard bounding-box format with y=0 at the top.

    Every output coordinate is clamped to the page: x values to
    ``[0, page_width]`` and y values to ``[0, page_height]``. A box lying
    partly or wholly off the page therefore never yields negative or
    beyond-page coordinates; a fully off-page box collapses to zero width
    and/or height on the nearest page edge.

    Parameters
    ----------
    pos : sequence of four numbers
        ``(left, bottom, right, top)`` in PDF coordinates.
    page_width : number
        Page width, in the same units as ``pos``.
    page_height : number
        Page height, in the same units as ``pos``.

    Returns
    -------
    list of int
        ``[x0, y0, x1, y1]`` with y measured from the top of the page, each
        value truncated to an integer.

    Note:
        This method assumes the PDF coordinate system follows the common convention
        where the origin is at the bottom-left. However, per the PDF specification,
        the coordinate system can theoretically be defined between any opposite corners,
        and its origin may not necessarily be (0,0). This implementation may not handle
        all edge cases where the coordinate system is arbitrarily transformed.

        Further processing may be necessary downstream, particularly in filtering or
        deduplication stages, to account for variations in coordinate transformations
        and ensure consistent bounding-box comparisons.

        See https://github.com/pypdfium2-team/pypdfium2/discussions/284.
    """
    left, bottom, right, top = pos

    def _clamp(value, upper):
        # Bound on both sides; one-sided clamping lets off-page boxes leak
        # negative or beyond-page coordinates.
        return min(max(0, value), upper)

    x0 = _clamp(left, page_width)
    x1 = _clamp(right, page_width)
    # Flip the vertical axis: PDF "top" becomes the smaller y value.
    y0 = _clamp(page_height - top, page_height)
    y1 = _clamp(page_height - bottom, page_height)

    return [int(x0), int(y0), int(x1), int(y1)]
[docs]
def is_scanned_page(page) -> bool:
    """
    Report whether a page has no extractable text but at least one image object.

    Parameters
    ----------
    page : pdfium.PdfPage
        The page to inspect.

    Returns
    -------
    bool
        True when the page's text, stripped of surrounding whitespace, is empty
        and the page contains one or more objects of type
        ``FPDF_PAGEOBJ_IMAGE``; False otherwise.
    """
    text_page = page.get_textpage()
    extracted = text_page.get_text_bounded() or ""
    has_text = len(extracted.strip()) != 0

    # Count image objects on the page.
    image_count = 0
    for page_obj in page.get_objects():
        if page_obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE:
            image_count += 1

    return (not has_text) and image_count > 0