# Source code for nv_ingest_client.util.file_processing.extract
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# pylint: disable=invalid-name
# pylint: disable=missing-class-docstring
# pylint: disable=logging-fstring-interpolation
import base64
import logging
import os
from enum import Enum
from io import BytesIO
from typing import Tuple
import charset_normalizer
logger = logging.getLogger(__name__)
# Enums
[docs]
class DocumentTypeEnum(str, Enum):
    """
    Enumeration of the document types this module can label a file with.

    The class mixes in ``str``, so every member is also a string equal to its
    value (for example ``DocumentTypeEnum.pdf == "pdf"``).
    """

    bmp = "bmp"
    docx = "docx"
    html = "html"
    jpeg = "jpeg"
    md = "md"
    pdf = "pdf"
    png = "png"
    pptx = "pptx"
    svg = "svg"
    tiff = "tiff"
    # Unlike the other members, the value here ("text") differs from the member name.
    txt = "text"
    mp3 = "mp3"
    wav = "wav"
# Maps MIME type strings to DocumentTypeEnum members.
# In the visible part of this file the table is referenced only by the
# commented-out MIME fallback in get_or_infer_file_type.
MIME_TO_DOCUMENT_TYPE = {
    "application/pdf": DocumentTypeEnum.pdf,
    "text/plain": DocumentTypeEnum.txt,
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": DocumentTypeEnum.docx,
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": DocumentTypeEnum.pptx,
    "image/jpeg": DocumentTypeEnum.jpeg,
    "image/bmp": DocumentTypeEnum.bmp,
    "image/png": DocumentTypeEnum.png,
    "image/svg+xml": DocumentTypeEnum.svg,
    "text/html": DocumentTypeEnum.html,
    # Add more as needed
}
# Maps file extensions to DocumentTypeEnum members.
# Keys are lower-case and carry no leading dot, matching how
# get_or_infer_file_type normalizes the extension before the lookup.
# NOTE(review): "html" and "md" map to DocumentTypeEnum.txt even though the
# enum declares dedicated html and md members - confirm this is intentional.
EXTENSION_TO_DOCUMENT_TYPE = {
    "bmp": DocumentTypeEnum.bmp,
    "docx": DocumentTypeEnum.docx,
    "html": DocumentTypeEnum.txt,
    "jpeg": DocumentTypeEnum.jpeg,
    "jpg": DocumentTypeEnum.jpeg,
    "json": DocumentTypeEnum.txt,
    "md": DocumentTypeEnum.txt,
    "pdf": DocumentTypeEnum.pdf,
    "png": DocumentTypeEnum.png,
    "pptx": DocumentTypeEnum.pptx,
    "sh": DocumentTypeEnum.txt,
    "svg": DocumentTypeEnum.svg,
    "tiff": DocumentTypeEnum.tiff,
    "txt": DocumentTypeEnum.txt,
    "mp3": DocumentTypeEnum.mp3,
    "wav": DocumentTypeEnum.wav,
    # Add more as needed
}
[docs]
def get_or_infer_file_type(file_path: str) -> DocumentTypeEnum:
    """
    Resolve a file's document type from the extension of its path.

    The extension is taken from ``file_path``, stripped of its leading dot,
    lower-cased, and looked up in ``EXTENSION_TO_DOCUMENT_TYPE``. The file
    itself is never opened. A MIME-type based fallback is sketched in the
    body but is currently disabled.

    Parameters
    ----------
    file_path : str
        The path to the file.

    Returns
    -------
    DocumentTypeEnum
        The enum member registered for the file's extension.

    Raises
    ------
    ValueError
        If the path has no extension or the extension is not registered.
    """
    _, dotted_suffix = os.path.splitext(file_path)
    suffix = dotted_suffix[1:].lower()
    doc_type = EXTENSION_TO_DOCUMENT_TYPE.get(suffix)

    if not doc_type:
        # TODO(Devin): libmagic is missing on the CI system, so the MIME type
        # fallback below stays disabled and unknown extensions fail right away.
        # mime_type = magic.from_file(file_path, mime=True)
        # for mime, doc_type in MIME_TO_DOCUMENT_TYPE.items():
        #     if mime_type == mime:
        #         return doc_type
        raise ValueError(f"Failed to determine file type for: {file_path}")

    return doc_type
[docs]
def serialize_to_base64(file_stream: BytesIO) -> str:
    """
    Read the remaining bytes of a stream and return them base64-encoded.

    Parameters
    ----------
    file_stream : BytesIO
        Binary stream to read, from its current position to the end.

    Returns
    -------
    str
        The base64 representation of the bytes that were read.

    Raises
    ------
    IOError
        If reading from the stream fails; the error is logged and re-raised.
    """
    try:
        raw_bytes = file_stream.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode("utf-8")
    except IOError:
        logger.error("Failed to read PDF file from BytesIO object")
        raise
[docs]
def detect_encoding_and_read_text_file(file_stream: BytesIO) -> str:
    """
    Read the remaining bytes of a stream, detect their encoding, and decode them.

    Parameters
    ----------
    file_stream : BytesIO
        Binary stream to read, from its current position to the end.

    Returns
    -------
    str
        The text decoded with the detected encoding, or with UTF-8 when no
        encoding could be detected.

    Raises
    ------
    IOError
        If reading from the stream fails; the error is logged and re-raised.
    UnicodeDecodeError
        If the bytes cannot be decoded with the chosen encoding. This is not
        caught here and propagates without being logged.
    """
    try:
        raw_data = file_stream.read()
        detection = charset_normalizer.detect(raw_data)
        # ``detect`` always returns an "encoding" key and sets it to None when
        # nothing could be detected, so a ``dict.get`` default never applies.
        # Using ``or`` makes the UTF-8 fallback actually take effect instead of
        # passing None to ``bytes.decode`` (which raises TypeError).
        encoding = detection.get("encoding") or "utf-8"
        return raw_data.decode(encoding)
    except IOError:
        logger.error("Failed to read text file from BytesIO object")
        raise