Source code for nv_ingest.framework.orchestration.morpheus.stages.extractors.pptx_extractor_stage

# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import functools
import logging

from morpheus.config import Config

from nv_ingest.framework.orchestration.morpheus.stages.meta.multiprocessing_stage import MultiProcessingBaseStage
from nv_ingest_api.internal.extract.pptx.pptx_extractor import extract_primitives_from_pptx_internal
from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema

# Module-level logger, named after this module's import path via __name__.
logger = logging.getLogger(__name__)


def generate_pptx_extractor_stage(
    c: Config,
    extraction_config: dict,
    task: str = "pptx-extract",
    task_desc: str = "pptx_content_extractor",
    pe_count: int = 8,
) -> MultiProcessingBaseStage:
    """
    Helper function to generate a multiprocessing stage to perform PPTX content extraction.

    The raw ``extraction_config`` mapping is validated by constructing a
    ``PPTXExtractorSchema`` from it, and the validated object is bound to
    ``extract_primitives_from_pptx_internal`` as its ``extraction_config``
    keyword argument. The resulting callable is handed to
    ``MultiProcessingBaseStage`` as the stage's ``process_fn``.

    Parameters
    ----------
    c : Config
        Morpheus global configuration object.
    extraction_config : dict
        Configuration parameters for PPTX content extractor. Unpacked as
        keyword arguments into ``PPTXExtractorSchema``.
    task : str
        The task name to match for the stage worker function.
    task_desc : str
        A descriptor to be used in latency tracing.
    pe_count : int
        The number of process engines to use for PPTX content extraction.

    Returns
    -------
    MultiProcessingBaseStage
        A Morpheus stage with the applied worker function.

    Raises
    ------
    Exception
        If an error occurs during config validation or stage generation. The
        error is logged with its traceback and then re-raised as a new
        exception of the same type carrying a contextual message, chained to
        the original. If the exception type cannot be constructed from a
        single message string, the original exception is re-raised unchanged.
    """
    try:
        validated_config = PPTXExtractorSchema(**extraction_config)
        _wrapped_process_fn = functools.partial(
            extract_primitives_from_pptx_internal, extraction_config=validated_config
        )

        return MultiProcessingBaseStage(
            c=c,
            pe_count=pe_count,
            task=task,
            task_desc=task_desc,
            process_fn=_wrapped_process_fn,
            document_type="pptx",
        )

    except Exception as e:
        err_msg = f"generate_pptx_extractor_stage: Error generating PPTX extractor stage. Original error: {e}"
        logger.error(err_msg, exc_info=True)

        # Not every exception class accepts a lone message string (for example
        # UnicodeDecodeError, or pydantic's ValidationError if the schema is a
        # pydantic model -- TODO confirm). Building the wrapper blindly would
        # replace the real failure with an unrelated TypeError, so guard it.
        try:
            wrapped = type(e)(err_msg)
        except Exception:
            wrapped = None

        if wrapped is None:
            # Fall back to the original exception so callers still see the
            # true error type; the contextual message is already in the log.
            raise

        raise wrapped from e