# Source code for nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docx_helper
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=too-many-locals
import logging
from typing import IO, Optional, List
from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
from nv_ingest_api.internal.enums.common import TextTypeEnum
from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docxreader import DocxReader
logger = logging.getLogger(__name__)
# [docs]
def python_docx(
    *,
    docx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Extract content from a DOCX byte stream by delegating to ``DocxReader``.

    A document has three levels: document, paragraphs and runs. To align with
    the PDF extraction, paragraphs are aliased as blocks. python-docx leaves
    page and line numbers to the renderer, so the entire document is treated
    as a single page. Run-level parsing is skipped but can be added as needed.

    This helper builds the ``source_metadata`` dictionary from the incoming
    row, constructs a ``DocxReader`` and returns whatever its ``extract_data``
    method produces.

    Parameters
    ----------
    docx_stream : IO
        Stream holding the DOCX document; passed straight to ``DocxReader``.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images.
    extract_infographics : bool
        Specifies whether to extract infographics.
    extract_tables : bool
        Specifies whether to extract tables.
    extract_charts : bool
        Specifies whether to extract charts.
    extraction_config : dict
        Configuration for the extraction process. Keys read here:

        * ``"row_data"`` - object supporting ``["source_id"]`` lookup and
          exposing ``.index`` for membership tests (presumably a pandas
          Series - confirm against the caller). Its ``"metadata"`` entry,
          when present, is used as the base unified metadata.
        * ``"text_depth"`` - value converted to ``TextTypeEnum``; defaults to
          ``"document"``.
        * ``"docx_extraction_config"`` - forwarded to ``DocxReader`` as
          ``extraction_config``; defaults to ``{}``.
    execution_trace_log : list, optional
        A list for accumulating trace information during extraction. Not used
        by this function. Defaults to None.

    Returns
    -------
    The value returned by ``DocxReader.extract_data``.
        NOTE(review): earlier documentation described this as a ``str`` of
        extracted text; the actual type is defined by ``DocxReader`` and is
        not visible here - confirm.
    """
    # Not used by this function; bound to `_` to mark it as intentionally ignored.
    _ = execution_trace_log
    # NOTE(review): redundant placeholder - the flag is forwarded to extract_data below.
    _ = extract_infographics

    row = extraction_config.get("row_data")
    doc_id = row["source_id"]

    depth = TextTypeEnum(extraction_config.get("text_depth", "document"))

    reader_config = extraction_config.get("docx_extraction_config", {})

    # Fall back to empty metadata when the row carries no "metadata" entry.
    if "metadata" in row.index:
        unified_metadata = row["metadata"]
    else:
        unified_metadata = {}

    # Location, collection, partition and access level are taken from the
    # upstream source_metadata when present, otherwise from the defaults below.
    upstream_source = unified_metadata.get("source_metadata", {})

    # python-docx doesn't maintain a filename, so source_id doubles as source_name.
    source_metadata = {
        "source_name": doc_id,
        "source_id": doc_id,
        "source_location": upstream_source.get("source_location", ""),
        "source_type": DocumentTypeEnum.DOCX,
        "collection_id": upstream_source.get("collection_id", ""),
        "partition_id": upstream_source.get("partition_id", -1),
        "access_level": upstream_source.get("access_level", AccessLevelEnum.UNKNOWN),
        "summary": "",
    }

    reader = DocxReader(docx_stream, source_metadata, extraction_config=reader_config)
    return reader.extract_data(
        unified_metadata,
        text_depth=depth,
        extract_text=extract_text,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extract_infographics=extract_infographics,
        extract_images=extract_images,
    )