# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=line-too-long
# pylint: disable=too-few-public-methods
"""
Parse document content and properties using python-docx
"""
import io
import logging
import re
import uuid
from datetime import datetime
from typing import Dict, Optional, Union
from typing import List
from typing import Tuple
from collections import defaultdict
import pandas as pd
from docx import Document
from docx.image.constants import MIME_TYPE
from docx.image.image import Image
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.table import _Cell
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from pandas import DataFrame
from nv_ingest.extraction_workflows.image.image_handlers import extract_page_elements_from_images
from nv_ingest.extraction_workflows.image.image_handlers import load_and_preprocess_image
from nv_ingest.schemas.image_extractor_schema import ImageConfigSchema
from nv_ingest.schemas.metadata_schema import ContentTypeEnum
from nv_ingest.schemas.metadata_schema import ImageTypeEnum
from nv_ingest.schemas.metadata_schema import StdContentDescEnum
from nv_ingest.schemas.metadata_schema import TextTypeEnum
from nv_ingest.schemas.metadata_schema import validate_metadata
from nv_ingest.util.converters import bytetools
from nv_ingest.util.detectors.language import detect_language
from nv_ingest.util.pdf.metadata_aggregators import construct_page_element_metadata, CroppedImageWithContent
PARAGRAPH_FORMATS = ["text", "markdown"]
TABLE_FORMATS = ["markdown", "markdown_light", "csv", "tag"]
logger = logging.getLogger(__name__)
[docs]
class DocxProperties:
    """
    Parse document core properties and update metadata.

    This class reads ``document.core_properties`` and mirrors the creation and
    modification timestamps into a caller-supplied metadata dictionary.
    Missing or blank textual properties are replaced by readable defaults.
    """

    def __init__(self, document: "Document", source_metadata: dict):
        """
        Initialize a DocxProperties instance by extracting core properties from a Document.

        Parameters
        ----------
        document : Document
            Object exposing a ``core_properties`` attribute (a python-docx Document).
        source_metadata : dict
            Dictionary updated in place with ``date_created`` and ``last_modified``.

        Notes
        -----
        The following attributes are set:

        - title: the document title, or "Untitled Document" when it is None or blank.
        - author: the author when non-blank; otherwise ``last_modified_by`` when
          non-blank; otherwise "Unknown Author".
        - created / modified: the stored datetimes, or ``datetime.now()`` when None.
        - keywords: the stored value, or an empty list when it is None.
        """
        self.document = document
        self.source_metadata = source_metadata
        core_properties = self.document.core_properties

        # Blank strings are treated like missing values for every textual
        # property, so the defaults below are reachable even when the property
        # is reported as "" rather than None.
        self.title = self._first_non_blank(core_properties.title, default="Untitled Document")
        self.author = self._first_non_blank(
            core_properties.author,
            core_properties.last_modified_by,
            default="Unknown Author",
        )

        # Use current datetime as fallback for created/modified.
        # NOTE(review): datetime.now() is naive local time; confirm this matches
        # the timezone convention of the stored values.
        self.created = core_properties.created if core_properties.created is not None else datetime.now()
        self.modified = core_properties.modified if core_properties.modified is not None else datetime.now()

        # Only None is replaced here; a blank keywords string is passed through
        # unchanged so the value handed to downstream consumers keeps its type.
        self.keywords = core_properties.keywords if core_properties.keywords is not None else []

        self._update_source_meta_data()

    @staticmethod
    def _first_non_blank(*candidates, default):
        """Return the first candidate that is neither None nor a blank string, else ``default``."""
        for candidate in candidates:
            if candidate is None:
                continue
            if isinstance(candidate, str) and candidate.strip() == "":
                continue
            return candidate
        return default

    def __str__(self):
        """
        Return a string representation of the document's core properties.

        Returns
        -------
        str
            A multi-line string listing title, author, created date, modified date,
            and keywords, each terminated by a newline.
        """
        info = "Document Properties:\n"
        info += f"title: {self.title}\n"
        info += f"author: {self.author}\n"
        info += f"created: {self.created.isoformat()}\n"
        info += f"modified: {self.modified.isoformat()}\n"
        info += f"keywords: {self.keywords}\n"
        return info

    def _update_source_meta_data(self):
        """
        Update the source metadata dictionary with the document's dates.

        Sets ``date_created`` and ``last_modified`` in ``source_metadata`` to the
        ISO-formatted ``created`` and ``modified`` values.

        Returns
        -------
        None
        """
        self.source_metadata.update(
            {
                "date_created": self.created.isoformat(),
                "last_modified": self.modified.isoformat(),
            }
        )
[docs]
class DocxReader:
# The class documentation is assigned to __doc__ explicitly because it is an
# f-string: it interpolates PARAGRAPH_FORMATS and TABLE_FORMATS so the listed
# supported formats always mirror the module-level constants.
__doc__ = f"""
Read a docx file and extract its content as text, images and tables.
Parameters
----------
docx :
Bytestream
paragraph_format : str
Format of the paragraphs. Supported formats are: {PARAGRAPH_FORMATS}
table_format : str
Format of the tables. Supported formats are: {TABLE_FORMATS}
handle_text_styles : bool
Whether to apply style on a paragraph (heading, list, title, subtitle).
Not recommended if the document has been converted from pdf.
image_tag : str
Tag to replace the images in the text. Must contain one placeholder for the image index.
table_tag : str
Tag to replace the tables in the text. Must contain one placeholder for the table index.
"""
def __init__(
    self,
    docx,
    source_metadata: Dict,
    paragraph_format: str = "markdown",
    table_format: str = "markdown",
    handle_text_styles: bool = True,
    image_tag="<image {}>",
    table_tag="<table {}>",
    extraction_config: Dict = None,
):
    """
    Validate the formatting options, open the document and reset reader state.

    Raises
    ------
    ValueError
        If ``paragraph_format`` is not in PARAGRAPH_FORMATS or ``table_format``
        is not in TABLE_FORMATS.
    """
    # Reject unsupported formats before the document is opened.
    paragraph_format_ok = paragraph_format in PARAGRAPH_FORMATS
    if not paragraph_format_ok:
        raise ValueError(f"Unknown paragraph format {paragraph_format}. Supported formats are: {PARAGRAPH_FORMATS}")
    table_format_ok = table_format in TABLE_FORMATS
    if not table_format_ok:
        raise ValueError(f"Unknown table format {table_format}. Supported formats are: {TABLE_FORMATS}")

    # Formatting options
    self.paragraph_format, self.table_format = paragraph_format, table_format
    self.handle_text_styles = handle_text_styles
    self.image_tag, self.table_tag = image_tag, table_tag

    # Open the document; building DocxProperties also writes the document
    # dates into source_metadata.
    self.document = Document(docx)
    self.properties = DocxProperties(self.document, source_metadata)
    logger.debug("%s", str(self.properties))

    # Pre-compiled patterns used by the text formatting helpers
    self.trailing_space_pattern = re.compile(r"(^\s*)(.*?)(\s*$)", re.DOTALL)
    self.empty_text_pattern = re.compile(r"^\s*$")

    # Collected images/tables and the running indices for their tags
    self.images, self.tables = [], []
    self.image_tag_index = 1
    self.table_tag_index = 1

    # Working state for metadata extraction
    self._accumulated_text = []
    self._extracted_data = []
    self._extraction_config = extraction_config or {}
    self._pending_images = []
    self._prev_para_image_idx = 0
    self._prev_para_images = []
[docs]
def is_text_empty(self, text: str) -> bool:
"""
Check if the given text is empty or matches the empty text pattern.
Parameters
----------
text : str
The text to check.
Returns
-------
bool
True if the text is empty or matches the empty text pattern, False otherwise.
"""
return self.empty_text_pattern.match(text) is not None
[docs]
def format_text(self, text: str, bold: bool, italic: bool, underline: bool) -> str:
"""
Apply markdown styling (bold, italic, underline) to the given text.
Parameters
----------
text : str
The text to format.
bold : bool
Whether to apply bold styling.
italic : bool
Whether to apply italic styling.
underline : bool
Whether to apply underline styling.
Returns
-------
str
The formatted text with the applied styles.
"""
if self.is_text_empty(text):
return text
# Exclude leading and trailing spaces from style
match = self.trailing_space_pattern.match(text)
if match:
prefix, text, suffix = match.groups()
else:
prefix, suffix = "", ""
# Apply style
if bold:
text = f"**{text}**"
if italic:
text = f"*{text}*"
if underline:
text = f"<u>{text}</u>"
# Add back leading and trailing spaces
text = prefix + text + suffix
return text
[docs]
@staticmethod
def apply_text_style(style: str, text: str, level: int = 0) -> str:
"""
Apply a specific text style (e.g., heading, list, title, subtitle) to the given text.
Parameters
----------
style : str
The style to apply. Supported styles include headings ("Heading 1" to "Heading 9"),
list items ("List"), and document structures ("Title", "Subtitle").
text : str
The text to style.
level : int, optional
The indentation level for the styled text. Default is 0.
Returns
-------
str
The text with the specified style and indentation applied.
"""
if re.match(r"^Heading [1-9]$", style):
n = int(style.split(" ")[-1])
text = f"{'#' * n} {text}"
elif style.startswith("List"):
text = f"- {text}"
elif style == "Title":
text = f"{text}\n{'=' * len(text)}"
elif style == "Subtitle":
text = f"{text}\n{'-' * len(text)}"
text = "\t" * level + text
return text
[docs]
@staticmethod
def docx_content_type_to_image_type(content_type: "MIME_TYPE") -> str:
"""
Convert a DOCX content type string to an image type.
Parameters
----------
content_type : MIME_TYPE
The content type string from the image header, e.g., "image/jpeg".
Returns
-------
str
The image type extracted from the content type string.
"""
return content_type.split("/")[1]
def _construct_image_metadata(
    self, para_idx: int, caption: str, base_unified_metadata: Dict, base64_img: str
) -> List[Union[str, dict]]:
    """
    Build metadata for an image in a DOCX file.

    Parameters
    ----------
    para_idx : int
        The paragraph index containing the image; stored as the hierarchy "block".
    caption : str
        The caption associated with the image.
    base_unified_metadata : dict
        The base metadata to build upon. It is copied, not mutated; a None or
        empty value is treated as an empty base, matching ``_construct_text_metadata``.
    base64_img : str
        The image content encoded as a base64 string.

    Returns
    -------
    list
        ``[content type value, validated metadata dump, new UUID string]``.
    """
    bbox = (0, 0, 0, 0)
    caption_len = len(caption.splitlines())

    page_idx = 0  # docx => single page
    page_count = 1

    page_nearby_blocks = {
        "text": {"content": [], "bbox": []},
        "images": {"content": [], "bbox": []},
        "structured": {"content": [], "bbox": []},
    }

    if caption_len:
        page_nearby_blocks["text"]["content"].append(caption)
        # One placeholder bbox per caption line; each entry is its own list so
        # the placeholders do not alias a single shared object.
        page_nearby_blocks["text"]["bbox"] = [[-1, -1, -1, -1] for _ in range(caption_len)]

    content_metadata = {
        "type": ContentTypeEnum.IMAGE,
        "description": StdContentDescEnum.DOCX_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": para_idx,
            "line": -1,
            "span": -1,
            "nearby_objects": page_nearby_blocks,
        },
    }

    image_metadata = {
        "image_type": ImageTypeEnum.image_type_1,
        "structured_image_type": ImageTypeEnum.image_type_1,
        "caption": caption,
        "text": "",
        "image_location": bbox,
    }

    # Guard against a missing base so this helper behaves like its text counterpart.
    unified_metadata = base_unified_metadata.copy() if base_unified_metadata else {}
    unified_metadata.update(
        {
            "content": base64_img,
            "source_metadata": self.properties.source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    validated_unified_metadata = validate_metadata(unified_metadata)

    return [
        ContentTypeEnum.IMAGE.value,
        validated_unified_metadata.model_dump(),
        str(uuid.uuid4()),
    ]
def _extract_para_images(
self, images: List["Image"], para_idx: int, caption: str, base_unified_metadata: Dict
) -> None:
"""
Collect images from a paragraph and store them for metadata construction.
Parameters
----------
images : list of Image
The images found in the paragraph.
para_idx : int
The index of the paragraph containing the images.
caption : str
The caption associated with the images.
base_unified_metadata : dict
The base metadata to associate with the images.
Returns
-------
None
"""
for image in images:
logger.debug("image content_type %s para_idx %d", image.content_type, para_idx)
logger.debug("image caption %s", caption)
# Simply append a tuple so we can build the final metadata in _finalize_images
self._pending_images.append((image, para_idx, caption, base_unified_metadata))
def _construct_text_metadata(
self, accumulated_text: List[str], para_idx: int, text_depth: "TextTypeEnum", base_unified_metadata: Dict
) -> List[Union[str, dict]]:
"""
Build metadata for text content in a DOCX file.
Parameters
----------
accumulated_text : list of str
The accumulated text to include in the metadata.
para_idx : int
The paragraph index containing the text.
text_depth : TextTypeEnum
The depth of the text content (e.g., page-level, paragraph-level).
base_unified_metadata : dict
The base metadata to build upon.
Returns
-------
list
A list containing the content type, validated metadata, and a unique identifier.
"""
if len(accumulated_text) < 1:
return []
extracted_text = " ".join(accumulated_text)
# the document is treated as a single page
page_number = 0 if text_depth == TextTypeEnum.PAGE else -1
content_metadata = {
"type": ContentTypeEnum.TEXT,
"description": StdContentDescEnum.DOCX_TEXT,
"page_number": page_number,
"hierarchy": {
"page_count": 1,
"page": page_number,
"block": para_idx,
"line": -1,
"span": -1,
},
}
language = detect_language(extracted_text)
text_metadata = {
"text_type": text_depth,
"summary": "",
"keywords": self.properties.keywords,
"language": language,
"text_location": (-1, -1, -1, -1),
}
ext_unified_metadata = base_unified_metadata.copy() if base_unified_metadata else {}
ext_unified_metadata.update(
{
"content": extracted_text,
"source_metadata": self.properties.source_metadata,
"content_metadata": content_metadata,
"text_metadata": text_metadata,
}
)
validated_unified_metadata = validate_metadata(ext_unified_metadata)
return [ContentTypeEnum.TEXT.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
def _extract_para_text(
    self,
    paragraph,
    paragraph_text,
    base_unified_metadata: Dict,
    text_depth: "TextTypeEnum",
    para_idx: int,
) -> None:
    """
    Style a paragraph's text, accumulate it, and flush it at BLOCK depth.

    Parameters
    ----------
    paragraph : Paragraph
        The paragraph whose style name and numbering level are consulted.
    paragraph_text : str
        The text content of the paragraph.
    base_unified_metadata : dict
        The base metadata to associate with extracted data.
    text_depth : TextTypeEnum
        The depth of text extraction; only BLOCK triggers an immediate flush.
    para_idx : int
        The index of the paragraph being processed.

    Returns
    -------
    None
    """
    if self.handle_text_styles:
        # The numbering level is read best-effort from the paragraph XML;
        # any failure means "no list level" (-1).
        try:
            numbering = paragraph._element.xpath("./w:pPr/w:numPr")[0]
            list_level = int(numbering.xpath("./w:ilvl/@w:val")[0])
        except Exception:
            list_level = -1
        paragraph_text = self.apply_text_style(paragraph.style.name, paragraph_text, list_level)

    self._accumulated_text.append(paragraph_text + "\n")

    # Other depths keep accumulating; the caller is expected to flush later.
    if text_depth != TextTypeEnum.BLOCK:
        return

    block_entry = self._construct_text_metadata(
        self._accumulated_text, para_idx, text_depth, base_unified_metadata
    )
    self._extracted_data.append(block_entry)
    self._accumulated_text = []
def _finalize_images(self, extract_tables: bool, extract_charts: bool, **kwargs) -> None:
    """
    Build and append final metadata for each pending image in batches.

    Every queued image is decoded and base64-encoded. When table or chart
    extraction is requested, one batched detection call is made over all
    images. An image with detections contributes one structured entry per
    detection and no plain image entry; any other image contributes a single
    image entry. The pending queue is emptied afterwards.

    Parameters
    ----------
    extract_tables : bool
        Whether to attempt table detection in images.
    extract_charts : bool
        Whether to attempt chart detection in images.
    **kwargs
        Additional options; only ``trace_info`` is read and forwarded to detection.

    Returns
    -------
    None

    Notes
    -----
    Detection runs when either flag is true; the flags are not forwarded
    individually. If detection raises, the error is logged and all images are
    emitted as plain image entries.
    """
    if not self._pending_images:
        return

    # 1) Convert all pending images into arrays (and also store base64 + context),
    #    so detection can run on them in one go.
    all_image_arrays = []
    image_info = []  # parallel list of (para_idx, caption, base_unified_metadata, base64_img)

    for docx_image, para_idx, caption, base_unified_metadata in self._pending_images:
        image_bytes = docx_image.blob
        image_array = load_and_preprocess_image(io.BytesIO(image_bytes))
        base64_img = str(bytetools.base64frombytes(image_bytes))

        all_image_arrays.append(image_array)
        image_info.append((para_idx, caption, base_unified_metadata, base64_img))

    # 2) If tables/charts are wanted, detect them in one pass for all images.
    detection_map = defaultdict(list)  # image_index -> list of detected elements
    if extract_tables or extract_charts:
        try:
            # The extraction config may be a pydantic model or a plain mapping
            # (the constructor's default is an empty dict), so only call
            # model_dump() when it exists instead of failing on a dict.
            extraction_config = self._extraction_config
            if hasattr(extraction_config, "model_dump"):
                config_kwargs = extraction_config.model_dump()
            else:
                config_kwargs = dict(extraction_config)

            detection_results = extract_page_elements_from_images(
                images=all_image_arrays,
                config=ImageConfigSchema(**config_kwargs),
                trace_info=kwargs.get("trace_info"),
            )
            # Each result is unpacked as (image_index, detected_element).
            for image_idx, cropped_item in detection_results:
                detection_map[image_idx].append(cropped_item)
        except Exception as e:
            logger.error(f"Error extracting tables/charts in batch: {e}")
            # Fall back to an empty map so that all images are treated normally.
            detection_map = {}

    # 3) For each pending image, emit structured entries or a plain image entry.
    for image_idx, (para_idx_i, caption_i, base_unified_metadata_i, base64_img_i) in enumerate(image_info):
        detections = detection_map.get(image_idx)
        if detections:
            for table_chart_data in detections:
                structured_entry = construct_page_element_metadata(
                    structured_image=table_chart_data,
                    page_idx=0,  # docx => single page
                    page_count=1,
                    source_metadata=self.properties.source_metadata,
                    base_unified_metadata=base_unified_metadata_i,
                )
                self._extracted_data.append(structured_entry)
        else:
            # Either detection was not requested, failed, or found nothing.
            image_entry = self._construct_image_metadata(
                para_idx_i,
                caption_i,
                base_unified_metadata_i,
                base64_img_i,
            )
            self._extracted_data.append(image_entry)

    # 4) Clear out the pending images after finalizing.
    self._pending_images = []
def _extract_table_data(
    self,
    child,
    base_unified_metadata: Dict,
) -> None:
    """
    Convert a DOCX table element into a structured metadata entry.

    The element is wrapped in a ``Table`` and passed to ``self.format_table``.
    The three values it returns are used as the table text, the images found
    in the table, and a tabular object. The images and tabular object are
    stored on the reader, and a "table" entry carrying the text is appended
    to ``self._extracted_data``.

    Parameters
    ----------
    child : element
        The table element to process.
    base_unified_metadata : dict
        The base metadata to associate with extracted data.

    Returns
    -------
    None
    """
    docx_table = Table(child, self.document)
    rendered_text, cell_images, table_frame = self.format_table(docx_table)

    self.images += cell_images
    self.tables.append(table_frame)

    # The table is represented by its text only: no image payload, zero-sized box.
    table_element = CroppedImageWithContent(
        content=rendered_text,
        image="",
        bbox=(0, 0, 0, 0),
        max_width=0,
        max_height=0,
        type_string="table",
    )

    table_entry = construct_page_element_metadata(
        structured_image=table_element,
        page_idx=0,  # docx => single page
        page_count=1,
        source_metadata=self.properties.source_metadata,
        base_unified_metadata=base_unified_metadata,
    )
    self._extracted_data.append(table_entry)