Source code for nv_ingest_api.util.converters.datetools
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from datetime import datetime
from datetime import timezone
from dateutil.parser import parse
from nv_ingest_api.util.exception_handlers.converters import datetools_exception_handler
@datetools_exception_handler
def datetimefrompdfmeta(pdf_formated_date: str, keep_tz: bool = False) -> str:
    """
    Convert a PDF metadata formatted date string to an ISO 8601 string.

    The strict PDF layout (``D:YYYYMMDDHHmmSS`` followed by a UTC offset) is
    tried first; if the input does not match it, parsing falls back to
    ``dateutil.parser.parse`` in fuzzy mode on the unmodified input.

    Parameters
    ----------
    pdf_formated_date : str
        A date string in standard PDF metadata format.
        Example: `str("D:20211222141131-07'00'")`
    keep_tz : bool, optional
        Keep or remove the timezone attribute of the parsed datetime. If `False` (necessary for arrow format),
        the parsed value is passed through `remove_tz` before formatting, so an offset-aware value is
        converted to UTC and rendered without an offset. If `True`, the parsed value is rendered as-is.

    Returns
    -------
    str
        The parsed date rendered with ``datetime.isoformat()``.

    Notes
    -----
    NOTE(review): this function is wrapped by `datetools_exception_handler`, which is defined
    elsewhere; how it treats exceptions raised here (e.g. by the fallback parser) is not visible
    from this module -- confirm against the decorator.
    """
    # Strict PDF date layout, including a mandatory UTC offset.
    pattern = "D:%Y%m%d%H%M%S%z"

    # The offset is written as HH'mm' in the docstring example above. Strip the
    # closing apostrophe only when it is actually there: removing the last
    # character unconditionally would cut a real digit (or a trailing "Z") from
    # inputs that omit it and push them onto the fuzzy fallback path.
    has_closing_quote = pdf_formated_date[-1:] == "'"
    trimmed = pdf_formated_date[:-1] if has_closing_quote else pdf_formated_date
    # The remaining apostrophe separates hours and minutes; "%z" understands ":".
    cleaned_date_string = trimmed.replace("'", ":")

    try:
        parsed_dt_tz = datetime.strptime(cleaned_date_string, pattern)
    except ValueError:
        # Not in the strict layout: let dateutil make a best-effort parse of
        # the original, uncleaned input.
        parsed_dt_tz = parse(pdf_formated_date, fuzzy=True)

    if not keep_tz:
        return remove_tz(parsed_dt_tz).isoformat()
    return parsed_dt_tz.isoformat()
[docs]
def remove_tz(datetime_obj: datetime) -> datetime:
    """
    Return a timezone-naive version of a datetime, expressed in UTC.

    Parameters
    ----------
    datetime_obj : datetime.datetime
        A datetime object with or without the timezone attribute set.

    Returns
    -------
    datetime.datetime
        If the input carries timezone information, a new datetime shifted to UTC with the
        timezone attribute removed. If the input is already naive, the same object is returned
        unchanged.
    """
    # A naive datetime has no offset to fold in; hand it back untouched.
    if datetime_obj.tzinfo is None:
        return datetime_obj

    # Shift the wall-clock fields to UTC first, then drop the tzinfo marker.
    return datetime_obj.astimezone(timezone.utc).replace(tzinfo=None)
[docs]
def validate_iso8601(date_string: str) -> None:
    """
    Verify that the given date string is in ISO 8601 format.

    The check is delegated to ``datetime.fromisoformat``, so "valid" means
    "accepted by that parser".

    Parameters
    ----------
    date_string : str
        A date string in human-readable format, ideally ISO 8601.

    Raises
    ------
    ValueError
        If the date string is not in a valid ISO 8601 format.
    """
    # Call the parser directly rather than inside an `assert`: assert
    # statements are removed when Python runs with -O, which would skip the
    # validation entirely. The parsed value itself is not needed -- only
    # whether parsing raises.
    datetime.fromisoformat(date_string)