mirror of
https://github.com/kellyjonbrazil/jc.git
synced 2025-07-13 01:20:24 +02:00
add format hints for performance optimization
This commit is contained in:
47
jc/utils.py
47
jc/utils.py
@ -6,7 +6,7 @@ import shutil
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from textwrap import TextWrapper
|
from textwrap import TextWrapper
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from typing import List, Union, Optional
|
from typing import List, Tuple, Union, Optional
|
||||||
|
|
||||||
|
|
||||||
def warning_message(message_lines: List[str]) -> None:
|
def warning_message(message_lines: List[str]) -> None:
|
||||||
@ -229,9 +229,9 @@ def input_type_check(data: str) -> None:
|
|||||||
|
|
||||||
|
|
||||||
class timestamp:
|
class timestamp:
|
||||||
def __init__(self, datetime_string: str) -> None:
|
def __init__(self, datetime_string: str, format_hint: Union[List, Tuple, None] = None) -> None:
|
||||||
"""
|
"""
|
||||||
Input a date-time text string of several formats and convert to a
|
Input a datetime text string of several formats and convert to a
|
||||||
naive or timezone-aware epoch timestamp in UTC.
|
naive or timezone-aware epoch timestamp in UTC.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
@ -239,6 +239,13 @@ class timestamp:
|
|||||||
datetime_string (str): a string representation of a
|
datetime_string (str): a string representation of a
|
||||||
datetime in several supported formats
|
datetime in several supported formats
|
||||||
|
|
||||||
|
format_hint (list | tuple): an optional list of format ID
|
||||||
|
integers to instruct the timestamp object to try those
|
||||||
|
formats first in the order given. Other formats will be
|
||||||
|
tried after the format hint list is exhausted. This can
|
||||||
|
speed up timestamp conversion so several different formats
|
||||||
|
don't have to be tried in brute-force fashion.
|
||||||
|
|
||||||
Returns a timestamp object with the following attributes:
|
Returns a timestamp object with the following attributes:
|
||||||
|
|
||||||
string (str): the input datetime string
|
string (str): the input datetime string
|
||||||
@ -253,7 +260,13 @@ class timestamp:
|
|||||||
detected in datetime string. None if conversion fails.
|
detected in datetime string. None if conversion fails.
|
||||||
"""
|
"""
|
||||||
self.string = datetime_string
|
self.string = datetime_string
|
||||||
dt = self._parse_dt(self.string)
|
|
||||||
|
if not format_hint:
|
||||||
|
format_hint = tuple()
|
||||||
|
else:
|
||||||
|
format_hint = tuple(format_hint)
|
||||||
|
|
||||||
|
dt = self._parse_dt(self.string, format_hint=format_hint)
|
||||||
self.format = dt['format']
|
self.format = dt['format']
|
||||||
self.naive = dt['timestamp_naive']
|
self.naive = dt['timestamp_naive']
|
||||||
self.utc = dt['timestamp_utc']
|
self.utc = dt['timestamp_utc']
|
||||||
@ -263,9 +276,9 @@ class timestamp:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=512)
|
@lru_cache(maxsize=512)
|
||||||
def _parse_dt(dt_string):
|
def _parse_dt(dt_string, format_hint=None):
|
||||||
"""
|
"""
|
||||||
Input a date-time text string of several formats and convert to
|
Input a datetime text string of several formats and convert to
|
||||||
a naive or timezone-aware epoch timestamp in UTC.
|
a naive or timezone-aware epoch timestamp in UTC.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
@ -273,6 +286,12 @@ class timestamp:
|
|||||||
dt_string: (string) a string representation of a date-time
|
dt_string: (string) a string representation of a date-time
|
||||||
in several supported formats
|
in several supported formats
|
||||||
|
|
||||||
|
format_hint: (list | tuple) a list of format ID ints that
|
||||||
|
should be tried first. This can increase
|
||||||
|
performance since the function will not need to
|
||||||
|
try many incorrect formats before finding the
|
||||||
|
correct one.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
Dictionary of the following format:
|
Dictionary of the following format:
|
||||||
@ -318,6 +337,12 @@ class timestamp:
|
|||||||
}
|
}
|
||||||
utc_tz = False
|
utc_tz = False
|
||||||
|
|
||||||
|
# normalize format_hint to a tuple. Note: lru_cache hashes the arguments before this body runs, so callers must already pass a hashable value (e.g. a tuple)
|
||||||
|
if not format_hint:
|
||||||
|
format_hint = tuple()
|
||||||
|
else:
|
||||||
|
format_hint = tuple(format_hint)
|
||||||
|
|
||||||
# sometimes UTC is referenced as 'Coordinated Universal Time'. Convert to 'UTC'
|
# sometimes UTC is referenced as 'Coordinated Universal Time'. Convert to 'UTC'
|
||||||
data = data.replace('Coordinated Universal Time', 'UTC')
|
data = data.replace('Coordinated Universal Time', 'UTC')
|
||||||
|
|
||||||
@ -398,7 +423,17 @@ class timestamp:
|
|||||||
p = re.compile(r'(\W\d\d:\d\d:\d\d\.\d{6})\d+\W')
|
p = re.compile(r'(\W\d\d:\d\d:\d\d\.\d{6})\d+\W')
|
||||||
normalized_datetime = p.sub(r'\g<1> ', normalized_datetime)
|
normalized_datetime = p.sub(r'\g<1> ', normalized_datetime)
|
||||||
|
|
||||||
|
# try format hints first, then fall back to brute-force method
|
||||||
|
hint_obj_list = []
|
||||||
|
for fmt_id in format_hint:
|
||||||
for fmt in formats:
|
for fmt in formats:
|
||||||
|
if fmt_id == fmt['id']:
|
||||||
|
hint_obj_list.append(fmt)
|
||||||
|
|
||||||
|
remaining_formats = [fmt for fmt in formats if not fmt['id'] in format_hint]
|
||||||
|
optimized_formats = hint_obj_list + remaining_formats
|
||||||
|
|
||||||
|
for fmt in optimized_formats:
|
||||||
try:
|
try:
|
||||||
locale.setlocale(locale.LC_TIME, fmt['locale'])
|
locale.setlocale(locale.LC_TIME, fmt['locale'])
|
||||||
dt = datetime.strptime(normalized_datetime, fmt['format'])
|
dt = datetime.strptime(normalized_datetime, fmt['format'])
|
||||||
|
Reference in New Issue
Block a user