# Source code for nv_ingest.extraction_workflows.docx.docxreader

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
# pylint: disable=too-few-public-methods

"""
Parse document content and properties using python-docx
"""
import io
import logging
import re
import uuid
from datetime import datetime
from typing import Dict, Optional, Union
from typing import List
from typing import Tuple

from collections import defaultdict

import pandas as pd
from docx import Document
from docx.image.constants import MIME_TYPE
from docx.image.image import Image
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.table import _Cell
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from pandas import DataFrame

from nv_ingest.extraction_workflows.image.image_handlers import extract_page_elements_from_images
from nv_ingest.extraction_workflows.image.image_handlers import load_and_preprocess_image
from nv_ingest.schemas.image_extractor_schema import ImageConfigSchema
from nv_ingest.schemas.metadata_schema import ContentTypeEnum
from nv_ingest.schemas.metadata_schema import ImageTypeEnum
from nv_ingest.schemas.metadata_schema import StdContentDescEnum
from nv_ingest.schemas.metadata_schema import TextTypeEnum
from nv_ingest.schemas.metadata_schema import validate_metadata
from nv_ingest.util.converters import bytetools
from nv_ingest.util.detectors.language import detect_language
from nv_ingest.util.pdf.metadata_aggregators import construct_page_element_metadata, CroppedImageWithContent

PARAGRAPH_FORMATS = ["text", "markdown"]
TABLE_FORMATS = ["markdown", "markdown_light", "csv", "tag"]

logger = logging.getLogger(__name__)


class DocxProperties:
    """
    Parse document core properties and update metadata.

    This class extracts core properties from a python-docx Document object and
    updates a provided metadata dictionary with standardized values. If certain
    properties are missing (``None``) or blank (empty / whitespace-only strings),
    smart defaults are used.
    """

    def __init__(self, document: "Document", source_metadata: dict):
        """
        Initialize a DocxProperties instance by extracting core properties from a Document.

        Parameters
        ----------
        document : Document
            A python-docx Document object representing the DOCX file.
        source_metadata : dict
            A dictionary containing source metadata. This dictionary is updated in
            place with the document's creation and modification dates.

        Notes
        -----
        The following core properties are extracted:
          - title: Defaults to "Untitled Document" if missing or blank.
          - author: Uses the document's author if it is not blank; otherwise falls
            back to last_modified_by, and finally to "Unknown Author".
          - created: The creation datetime; if missing, defaults to the current datetime.
          - modified: The last modified datetime; if missing, defaults to the current datetime.
          - keywords: The document's keywords; if missing or blank, defaults to an empty list.

        The source_metadata dictionary is updated with:
          - date_created: ISO formatted string of the created date.
          - last_modified: ISO formatted string of the modified date.
        """
        self.document = document
        self.source_metadata = source_metadata

        core_properties = self.document.core_properties

        # String-valued core properties may come back as "" rather than None when the
        # element is absent, so blank strings are treated the same as missing values.
        title = core_properties.title
        self.title = "Untitled Document" if self._is_blank(title) else title

        # Use author if available; otherwise, fall back to last_modified_by or default
        author = core_properties.author
        last_modified_by = core_properties.last_modified_by
        if not self._is_blank(author):
            self.author = author
        elif not self._is_blank(last_modified_by):
            self.author = last_modified_by
        else:
            self.author = "Unknown Author"

        # Use current datetime as fallback for created/modified
        self.created = core_properties.created if core_properties.created is not None else datetime.now()
        self.modified = core_properties.modified if core_properties.modified is not None else datetime.now()

        # Default keywords to an empty list if missing or blank
        keywords = core_properties.keywords
        self.keywords = [] if self._is_blank(keywords) else keywords

        self._update_source_meta_data()

    @staticmethod
    def _is_blank(value) -> bool:
        """
        Tell whether a core property value should be considered missing.

        Parameters
        ----------
        value : Any
            The raw property value.

        Returns
        -------
        bool
            True if the value is None or a string containing only whitespace.
        """
        return value is None or (isinstance(value, str) and not value.strip())

    def __str__(self):
        """
        Return a string representation of the document's core properties.

        Returns
        -------
        str
            A formatted string containing the title, author, created date,
            modified date, and keywords of the document.
        """
        info = "Document Properties:\n"
        info += f"title: {self.title}\n"
        info += f"author: {self.author}\n"
        info += f"created: {self.created.isoformat()}\n"
        info += f"modified: {self.modified.isoformat()}\n"
        info += f"keywords: {self.keywords}\n"
        return info

    def _update_source_meta_data(self):
        """
        Update the source metadata dictionary with the document's core properties.

        Sets the 'date_created' and 'last_modified' fields of the source_metadata
        dictionary to the ISO formatted string representations of the created and
        modified dates.

        Returns
        -------
        None
        """
        self.source_metadata.update(
            {
                "date_created": self.created.isoformat(),
                "last_modified": self.modified.isoformat(),
            }
        )
class DocxReader:
    __doc__ = f"""
    Read a docx file and extract its content as text, images and tables.

    Parameters
    ----------
    docx :
        Bytestream
    paragraph_format : str
        Format of the paragraphs. Supported formats are: {PARAGRAPH_FORMATS}
    table_format : str
        Format of the tables. Supported formats are: {TABLE_FORMATS}
    handle_text_styles : bool
        Whether to apply style on a paragraph (heading, list, title, subtitle).
        Not recommended if the document has been converted from pdf.
    image_tag : str
        Tag to replace the images in the text. Must contain one placeholder for the image index.
    table_tag : str
        Tag to replace the tables in the text. Must contain one placeholder for the table index.
    """

    def __init__(
        self,
        docx,
        source_metadata: Dict,
        paragraph_format: str = "markdown",
        table_format: str = "markdown",
        handle_text_styles: bool = True,
        image_tag="<image {}>",
        table_tag="<table {}>",
        extraction_config: Optional[Dict] = None,
    ):
        """
        Open the DOCX stream, read its core properties and reset extraction state.

        Raises
        ------
        ValueError
            If ``paragraph_format`` or ``table_format`` is not a supported format.
        """
        if paragraph_format not in PARAGRAPH_FORMATS:
            raise ValueError(f"Unknown paragraph format {paragraph_format}. Supported formats are: {PARAGRAPH_FORMATS}")

        if table_format not in TABLE_FORMATS:
            raise ValueError(f"Unknown table format {table_format}. Supported formats are: {TABLE_FORMATS}")

        self.paragraph_format = paragraph_format
        self.table_format = table_format
        self.handle_text_styles = handle_text_styles
        self.image_tag = image_tag
        self.table_tag = table_tag

        # Read docx
        self.document = Document(docx)

        # Get the core properties (this also updates source_metadata in place)
        self.properties = DocxProperties(self.document, source_metadata)
        logger.debug("%s", str(self.properties))

        # Splits a string into (leading whitespace, core text, trailing whitespace)
        self.trailing_space_pattern = re.compile(r"(^\s*)(.*?)(\s*$)", re.DOTALL)
        self.empty_text_pattern = re.compile(r"^\s*$")

        self.images = []
        self.tables = []
        self.image_tag_index = 1
        self.table_tag_index = 1

        # placeholders for metadata extraction
        self._accumulated_text = []
        self._extracted_data = []
        self._extraction_config = extraction_config if extraction_config else {}
        self._pending_images = []
        self._prev_para_image_idx = 0
        self._prev_para_images = []

    def is_text_empty(self, text: str) -> bool:
        """
        Check if the given text is empty or matches the empty text pattern.

        Parameters
        ----------
        text : str
            The text to check.

        Returns
        -------
        bool
            True if the text is empty or contains only whitespace, False otherwise.
        """
        return self.empty_text_pattern.match(text) is not None

    def format_text(self, text: str, bold: bool, italic: bool, underline: bool) -> str:
        """
        Apply markdown styling (bold, italic, underline) to the given text.

        Parameters
        ----------
        text : str
            The text to format.
        bold : bool
            Whether to apply bold styling.
        italic : bool
            Whether to apply italic styling.
        underline : bool
            Whether to apply underline styling.

        Returns
        -------
        str
            The formatted text with the applied styles. Whitespace-only text is
            returned unchanged.
        """
        if self.is_text_empty(text):
            return text

        # Exclude leading and trailing spaces from style
        match = self.trailing_space_pattern.match(text)
        if match:
            prefix, text, suffix = match.groups()
        else:
            prefix, suffix = "", ""

        # Apply style
        if bold:
            text = f"**{text}**"
        if italic:
            text = f"*{text}*"
        if underline:
            text = f"<u>{text}</u>"

        # Add back leading and trailing spaces
        text = prefix + text + suffix

        return text

    def format_paragraph(self, paragraph: "Paragraph") -> Tuple[str, List["Image"]]:
        """
        Format a paragraph into styled text and extract associated images.

        Parameters
        ----------
        paragraph : Paragraph
            The paragraph to format. This includes text and potentially embedded images.

        Returns
        -------
        tuple of (str, list of Image)
            - The formatted paragraph text with markdown styling applied.
            - A list of extracted images from the paragraph.

        Notes
        -----
        When ``paragraph_format`` is "text", the plain paragraph text is returned and
        no images are collected.
        """
        paragraph_images = []
        if self.paragraph_format == "text":
            paragraph_text = paragraph.text
        else:
            # Get the default style of the paragraph, "markdown"
            font = paragraph.style.font
            default_style = (font.bold, font.italic, font.underline)

            # Iterate over the runs of the paragraph and group them by style, excluding empty runs
            paragraph_text = ""
            group_text = ""
            previous_style = None

            for c in paragraph.iter_inner_content():
                if isinstance(c, Hyperlink):
                    text = f"[{c.text}]({c.address})"
                    # A hyperlink may carry no runs; fall back to the paragraph defaults then
                    hyperlink_runs = c.runs
                    if hyperlink_runs:
                        first_run = hyperlink_runs[0]
                        style = (first_run.bold, first_run.italic, first_run.underline)
                    else:
                        style = (None, None, None)
                elif isinstance(c, Run):
                    text = c.text
                    style = (c.bold, c.italic, c.underline)

                    # 1. Locate the inline shape which is stored in the <w:drawing> element.
                    # 2. r:embed in <a.blip> has the relationship id for extracting the file where
                    # the image is stored as bytes.
                    # Reference:
                    # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
                    inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
                    for r_id in inline_shapes:
                        text += self.image_tag.format(self.image_tag_index)
                        self.image_tag_index += 1
                        image = paragraph.part.related_parts[r_id].image
                        paragraph_images.append(image)
                else:
                    continue

                # Unset (None) style flags inherit the paragraph's default style
                style = tuple(s if s is not None else d for s, d in zip(style, default_style))

                # If the style changes for a non empty text, format the previous group and start a new one
                if (not self.is_text_empty(text)) and (previous_style is not None):
                    if style != previous_style:
                        paragraph_text += self.format_text(group_text, *previous_style)
                        group_text = ""

                group_text += text
                if not self.is_text_empty(text):
                    previous_style = style

            # Format the last group with the style it was accumulated under. The style of
            # the final run must not be used here: a trailing whitespace-only run with a
            # different style would otherwise change the styling of the whole group.
            if group_text:
                # previous_style is None only when every run was whitespace-only, in which
                # case format_text returns the text unchanged regardless of the style.
                last_style = previous_style if previous_style is not None else (False, False, False)
                paragraph_text += self.format_text(group_text, *last_style)

        # Remove trailing spaces
        paragraph_text = paragraph_text.strip()
        return paragraph_text, paragraph_images

    def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
        """
        Format a table cell into Markdown text and extract associated images.

        Parameters
        ----------
        cell : _Cell
            The table cell to format.

        Returns
        -------
        tuple of (str, list of Image)
            - The formatted text of the cell with markdown styling applied.
            - A flat list of images extracted from all paragraphs of the cell.
        """
        newline = "<br>" if self.paragraph_format == "markdown" else "\n"

        cell_texts = []
        cell_images = []
        for paragraph in cell.paragraphs:
            paragraph_text, paragraph_images = self.format_paragraph(paragraph)
            cell_texts.append(paragraph_text)
            # Flatten so callers receive Image objects rather than per-paragraph lists
            cell_images.extend(paragraph_images)

        return newline.join(cell_texts), cell_images

    def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
        """
        Format a table into text, extract images, and represent it as a DataFrame.

        Parameters
        ----------
        table : Table
            The table to format.

        Returns
        -------
        tuple of (str or None, list of Image, DataFrame)
            - The formatted table as text, using the specified format (e.g., markdown, CSV).
            - A list of images extracted from the table.
            - A DataFrame representation of the table's content. The first table row
              is used as the column header.

        Raises
        ------
        ValueError
            If ``self.table_format`` is not a supported table format.
        """
        rows = [[self.format_cell(cell) for cell in row.cells] for row in table.rows]
        texts = [[text for text, _ in row] for row in rows]
        table_images = [image for row in rows for _, images in row for image in images]

        # A table without rows has no header row to promote to column names
        if texts:
            dataframe = pd.DataFrame(texts[1:], columns=texts[0])
        else:
            dataframe = pd.DataFrame()

        if "markdown" in self.table_format:
            table_text = dataframe.to_markdown(index=False)
            if self.table_format == "markdown_light":
                # Collapse padding and separator runs to keep the table compact
                table_text = re.sub(r"\s{2,}", " ", table_text)
                table_text = re.sub(r"-{2,}", "-", table_text)
        elif self.table_format == "csv":
            table_text = dataframe.to_csv()
        elif self.table_format == "tag":
            table_text = self.table_tag.format(self.table_tag_index)
            self.table_tag_index += 1
        else:
            raise ValueError(f"Unknown table format {self.table_format}")

        return table_text, table_images, dataframe

    @staticmethod
    def apply_text_style(style: str, text: str, level: int = 0) -> str:
        """
        Apply a specific text style (e.g., heading, list, title, subtitle) to the given text.

        Parameters
        ----------
        style : str
            The style to apply. Supported styles include headings ("Heading 1" to "Heading 9"),
            list items (any style name starting with "List"), and document structures
            ("Title", "Subtitle"). Other styles leave the text unchanged.
        text : str
            The text to style.
        level : int, optional
            The indentation level for the styled text; one tab is prepended per level.
            Default is 0. Zero or negative values add no indentation.

        Returns
        -------
        str
            The text with the specified style and indentation applied.
        """
        if re.match(r"^Heading [1-9]$", style):
            n = int(style.split(" ")[-1])
            text = f"{'#' * n} {text}"
        elif style.startswith("List"):
            text = f"- {text}"
        elif style == "Title":
            text = f"{text}\n{'=' * len(text)}"
        elif style == "Subtitle":
            text = f"{text}\n{'-' * len(text)}"

        text = "\t" * level + text

        return text

    @staticmethod
    def docx_content_type_to_image_type(content_type: "MIME_TYPE") -> str:
        """
        Convert a DOCX content type string to an image type.

        Parameters
        ----------
        content_type : MIME_TYPE
            The content type string from the image header, e.g., "image/jpeg".

        Returns
        -------
        str
            The image type extracted from the content type string (the part after "/").
        """
        return content_type.split("/")[1]

    def _construct_image_metadata(
        self, para_idx: int, caption: str, base_unified_metadata: Dict, base64_img: str
    ) -> List[Union[str, dict]]:
        """
        Build metadata for an image in a DOCX file.

        Parameters
        ----------
        para_idx : int
            The paragraph index containing the image.
        caption : str
            The caption associated with the image.
        base_unified_metadata : dict
            The base metadata to build upon.
        base64_img : str
            The image content encoded as a base64 string.

        Returns
        -------
        list
            A list containing the content type, validated metadata, and a unique identifier.
        """
        bbox = (0, 0, 0, 0)
        caption_len = len(caption.splitlines())

        page_idx = 0  # docx => single page
        page_count = 1

        page_nearby_blocks = {
            "text": {"content": [], "bbox": []},
            "images": {"content": [], "bbox": []},
            "structured": {"content": [], "bbox": []},
        }

        if caption_len:
            page_nearby_blocks["text"]["content"].append(caption)
            page_nearby_blocks["text"]["bbox"] = [[-1, -1, -1, -1]] * caption_len

        content_metadata = {
            "type": ContentTypeEnum.IMAGE,
            "description": StdContentDescEnum.DOCX_IMAGE,
            "page_number": page_idx,
            "hierarchy": {
                "page_count": page_count,
                "page": page_idx,
                "block": para_idx,
                "line": -1,
                "span": -1,
                "nearby_objects": page_nearby_blocks,
            },
        }

        image_metadata = {
            "image_type": ImageTypeEnum.image_type_1,
            "structured_image_type": ImageTypeEnum.image_type_1,
            "caption": caption,
            "text": "",
            "image_location": bbox,
        }

        unified_metadata = base_unified_metadata.copy()
        unified_metadata.update(
            {
                "content": base64_img,
                "source_metadata": self.properties.source_metadata,
                "content_metadata": content_metadata,
                "image_metadata": image_metadata,
            }
        )

        validated_unified_metadata = validate_metadata(unified_metadata)

        return [
            ContentTypeEnum.IMAGE.value,
            validated_unified_metadata.model_dump(),
            str(uuid.uuid4()),
        ]

    def _extract_para_images(
        self, images: List["Image"], para_idx: int, caption: str, base_unified_metadata: Dict
    ) -> None:
        """
        Collect images from a paragraph and store them for metadata construction.

        Parameters
        ----------
        images : list of Image
            The images found in the paragraph.
        para_idx : int
            The index of the paragraph containing the images.
        caption : str
            The caption associated with the images.
        base_unified_metadata : dict
            The base metadata to associate with the images.

        Returns
        -------
        None
        """
        for image in images:
            logger.debug("image content_type %s para_idx %d", image.content_type, para_idx)
            logger.debug("image caption %s", caption)

            # Simply append a tuple so we can build the final metadata in _finalize_images
            self._pending_images.append((image, para_idx, caption, base_unified_metadata))

    def _construct_text_metadata(
        self, accumulated_text: List[str], para_idx: int, text_depth: "TextTypeEnum", base_unified_metadata: Dict
    ) -> List[Union[str, dict]]:
        """
        Build metadata for text content in a DOCX file.

        Parameters
        ----------
        accumulated_text : list of str
            The accumulated text to include in the metadata.
        para_idx : int
            The paragraph index containing the text.
        text_depth : TextTypeEnum
            The depth of the text content (e.g., page-level, paragraph-level).
        base_unified_metadata : dict
            The base metadata to build upon.

        Returns
        -------
        list
            A list containing the content type, validated metadata, and a unique identifier,
            or an empty list when there is no accumulated text.
        """
        if len(accumulated_text) < 1:
            return []

        extracted_text = " ".join(accumulated_text)

        # the document is treated as a single page
        page_number = 0 if text_depth == TextTypeEnum.PAGE else -1
        content_metadata = {
            "type": ContentTypeEnum.TEXT,
            "description": StdContentDescEnum.DOCX_TEXT,
            "page_number": page_number,
            "hierarchy": {
                "page_count": 1,
                "page": page_number,
                "block": para_idx,
                "line": -1,
                "span": -1,
            },
        }

        language = detect_language(extracted_text)
        text_metadata = {
            "text_type": text_depth,
            "summary": "",
            "keywords": self.properties.keywords,
            "language": language,
            "text_location": (-1, -1, -1, -1),
        }

        ext_unified_metadata = base_unified_metadata.copy() if base_unified_metadata else {}
        ext_unified_metadata.update(
            {
                "content": extracted_text,
                "source_metadata": self.properties.source_metadata,
                "content_metadata": content_metadata,
                "text_metadata": text_metadata,
            }
        )

        validated_unified_metadata = validate_metadata(ext_unified_metadata)

        return [ContentTypeEnum.TEXT.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]

    def _extract_para_text(
        self,
        paragraph,
        paragraph_text,
        base_unified_metadata: Dict,
        text_depth: "TextTypeEnum",
        para_idx: int,
    ) -> None:
        """
        Accumulate the (optionally styled) text of a DOCX paragraph.

        Parameters
        ----------
        paragraph : Paragraph
            The paragraph to process.
        paragraph_text : str
            The text content of the paragraph.
        base_unified_metadata : dict
            The base metadata to associate with extracted data.
        text_depth : TextTypeEnum
            The depth of text extraction (e.g., block-level, document-level).
        para_idx : int
            The index of the paragraph being processed.

        Returns
        -------
        None
        """
        # Handle text styles if desired
        if self.handle_text_styles:
            # Best effort: paragraphs without numbering properties get no indentation
            # (a negative level adds no tabs in apply_text_style).
            try:
                numPr = paragraph._element.xpath("./w:pPr/w:numPr")[0]
                level = int(numPr.xpath("./w:ilvl/@w:val")[0])
            except Exception:
                level = -1
            paragraph_text = self.apply_text_style(paragraph.style.name, paragraph_text, level)

        self._accumulated_text.append(paragraph_text + "\n")

        # If text_depth is BLOCK, we flush after each paragraph
        if text_depth == TextTypeEnum.BLOCK:
            text_extraction = self._construct_text_metadata(
                self._accumulated_text, para_idx, text_depth, base_unified_metadata
            )
            # Same guard as the end-of-document flush in extract_data
            if text_extraction:
                self._extracted_data.append(text_extraction)
            self._accumulated_text = []

    def _finalize_images(self, extract_tables: bool, extract_charts: bool, **kwargs) -> None:
        """
        Build and append final metadata for each pending image in batches.

        Parameters
        ----------
        extract_tables : bool
            Whether to attempt table detection in images.
        extract_charts : bool
            Whether to attempt chart detection in images.
        **kwargs
            Additional configuration for image processing. Only ``trace_info`` is read.

        Returns
        -------
        None
        """
        if not self._pending_images:
            return

        # 1) Convert all pending images into numpy arrays (and also store base64 + context),
        # so we can run detection on them in one go.
        all_image_arrays = []
        image_info = []  # parallel list to hold (para_idx, caption, base_unified_metadata, base64_img)

        for docx_image, para_idx, caption, base_unified_metadata in self._pending_images:
            # Convert docx image blob to BytesIO, then to numpy array
            image_bytes = docx_image.blob
            image_stream = io.BytesIO(image_bytes)
            image_array = load_and_preprocess_image(image_stream)
            base64_img = str(bytetools.base64frombytes(image_bytes))

            all_image_arrays.append(image_array)

            # Keep track of all needed metadata so we can rebuild final entries
            image_info.append((para_idx, caption, base_unified_metadata, base64_img))

        # 2) If the user wants to detect tables/charts, do it in one pass for all images.
        detection_map = defaultdict(list)  # maps image_index -> list of CroppedImageWithContent

        if extract_tables or extract_charts:
            try:
                # The extraction config may be a plain dict (the constructor default is {})
                # or a pydantic model; only the latter provides model_dump().
                extraction_config = self._extraction_config
                if hasattr(extraction_config, "model_dump"):
                    config_kwargs = extraction_config.model_dump()
                else:
                    config_kwargs = dict(extraction_config)

                # Perform the batched detection on all images
                detection_results = extract_page_elements_from_images(
                    images=all_image_arrays,
                    config=ImageConfigSchema(**config_kwargs),
                    trace_info=kwargs.get("trace_info"),
                )
                # detection_results is typically List[Tuple[int, CroppedImageWithContent]]
                # Group by image_index
                for image_idx, cropped_item in detection_results:
                    detection_map[image_idx].append(cropped_item)

            except Exception as e:
                logger.exception("Error extracting tables/charts in batch: %s", e)
                # If something goes wrong, we can fall back to empty detection map
                # so that all images are treated normally
                detection_map = {}

        # 3) For each pending image, decide if we found tables/charts or not.
        for i, (para_idx_i, caption_i, base_unified_metadata_i, base64_img_i) in enumerate(image_info):
            # .get() avoids inserting empty entries into the defaultdict
            detections = detection_map.get(i)

            if detections:
                # Table(s)/chart(s) were found in this image
                for table_chart_data in detections:
                    # Build structured metadata for each table or chart
                    structured_entry = construct_page_element_metadata(
                        structured_image=table_chart_data,  # A CroppedImageWithContent
                        page_idx=0,  # docx => single page
                        page_count=1,
                        source_metadata=self.properties.source_metadata,
                        base_unified_metadata=base_unified_metadata_i,
                    )
                    self._extracted_data.append(structured_entry)
            else:
                # Either detection was not requested, or no table/chart was found
                image_entry = self._construct_image_metadata(
                    para_idx_i,
                    caption_i,
                    base_unified_metadata_i,
                    base64_img_i,
                )
                self._extracted_data.append(image_entry)

        # 4) Clear out the pending images after finalizing
        self._pending_images = []

    def _extract_table_data(
        self,
        child,
        base_unified_metadata: Dict,
    ) -> None:
        """
        Process the text and images in a DOCX table.

        Parameters
        ----------
        child : element
            The table element to process.
        base_unified_metadata : dict
            The base metadata to associate with extracted data.

        Returns
        -------
        None
        """
        # Table
        table = Table(child, self.document)
        table_text, table_images, table_dataframe = self.format_table(table)

        self.images += table_images
        self.tables.append(table_dataframe)

        cropped_image_with_content = CroppedImageWithContent(
            content=table_text,
            image="",  # no image content
            bbox=(0, 0, 0, 0),
            max_width=0,
            max_height=0,
            type_string="table",
        )

        self._extracted_data.append(
            construct_page_element_metadata(
                structured_image=cropped_image_with_content,
                page_idx=0,  # docx => single page
                page_count=1,
                source_metadata=self.properties.source_metadata,
                base_unified_metadata=base_unified_metadata,
            )
        )

    def extract_data(
        self,
        base_unified_metadata: Dict,
        text_depth: "TextTypeEnum",
        extract_text: bool,
        extract_charts: bool,
        extract_tables: bool,
        extract_images: bool,
    ) -> list[list[str | dict]]:
        """
        Iterate over paragraphs and tables in a DOCX document to extract data.

        Parameters
        ----------
        base_unified_metadata : dict
            The base metadata to associate with all extracted content.
        text_depth : TextTypeEnum
            The depth of text extraction (e.g., block-level, document-level).
        extract_text : bool
            Whether to extract text from the document.
        extract_charts : bool
            Whether to extract charts from the document.
        extract_tables : bool
            Whether to extract tables from the document.
        extract_images : bool
            Whether to extract images from the document.

        Returns
        -------
        list
            The extracted entries. Each entry is itself a list; text and image entries
            hold the content type, the validated metadata, and a unique identifier.
        """
        # Reset per-call state so the reader can be reused
        self._accumulated_text = []
        self._extracted_data = []
        self._pending_images = []
        self._prev_para_images = []
        self._prev_para_image_idx = 0

        para_idx = 0
        for child in self.document.element.body.iterchildren():
            if isinstance(child, CT_P):
                paragraph = Paragraph(child, self.document)
                paragraph_text, paragraph_images = self.format_paragraph(paragraph)

                if extract_text:
                    self._extract_para_text(
                        paragraph,
                        paragraph_text,
                        base_unified_metadata,
                        text_depth,
                        para_idx,
                    )

                if (extract_charts or extract_images or extract_tables) and paragraph_images:
                    self._prev_para_images = paragraph_images
                    self._prev_para_image_idx = para_idx
                    self._pending_images += [(image, para_idx, "", base_unified_metadata) for image in paragraph_images]
                    self.images += paragraph_images

            elif isinstance(child, CT_Tbl):
                if extract_tables or extract_charts:
                    self._extract_table_data(child, base_unified_metadata)

            # The index counts every body child, not only paragraphs and tables
            para_idx += 1

        # If there's leftover text at the doc's end
        if (
            extract_text
            and text_depth in (TextTypeEnum.DOCUMENT, TextTypeEnum.PAGE)
            and len(self._accumulated_text) > 0
        ):
            text_extraction = self._construct_text_metadata(
                self._accumulated_text,
                -1,
                text_depth,
                base_unified_metadata,
            )
            if text_extraction:
                self._extracted_data.append(text_extraction)

        # Final pass: Decide if images are just images or contain tables/charts
        if extract_images or extract_tables or extract_charts:
            self._finalize_images(
                extract_tables=extract_tables,
                extract_charts=extract_charts,
                trace_info=None,
            )

        return self._extracted_data