1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-19 00:17:51 +02:00
Files
jc/jc/utils.py

540 lines
22 KiB
Python
Raw Normal View History

2022-03-04 13:27:39 -08:00
"""jc - JSON Convert utils"""
import sys
2021-03-29 14:45:13 -07:00
import re
import locale
2021-09-23 20:53:31 -07:00
import shutil
from datetime import datetime, timezone
2021-09-23 20:53:31 -07:00
from textwrap import TextWrapper
from functools import lru_cache
from typing import List, Dict, Iterable, Union, Optional, TextIO
2022-10-18 11:01:59 -07:00
from .jc_types import TimeStampFormatType
2022-04-27 07:37:31 -07:00
def _asciify(string: str) -> str:
    """
    Downgrade a Unicode string to plain ASCII.

    The copyright sign is spelled out as '(c)'; every other non-ASCII
    character is turned into its backslash escape by `ascii()`. Newlines,
    which `ascii()` also escapes, are turned back into real newlines.
    """
    with_copyright = string.replace('©', '(c)')

    # ascii() returns a quoted repr: drop the quote character at each end
    escaped = ascii(with_copyright)[1:-1]

    return escaped.replace(r'\n', '\n')
2022-10-14 17:12:58 -07:00
def _safe_print(
    string: str,
    sep: str = ' ',
    end: str = '\n',
    file: Optional[TextIO] = None,
    flush: bool = False
) -> None:
    """
    Output for both UTF-8 and ASCII encoding systems.

    Prints `string` normally. If the target stream cannot encode it
    (`UnicodeEncodeError`), an ASCII-only version of the string is printed
    instead.

    Parameters:

        string:   (string) text to print

        sep:      (string) passed through to `print()`

        end:      (string) passed through to `print()`

        file:     (text stream) destination stream. `None` (default)
                  means whatever `sys.stdout` is at call time.

        flush:    (bool) passed through to `print()`

    Returns:

        None - just prints output
    """
    # `file=None` lets print() look up sys.stdout when it is called. A
    # `file=sys.stdout` default would be bound once at import time and
    # would ignore any later redirection of sys.stdout.
    try:
        print(string, sep=sep, end=end, file=file, flush=flush)
    except UnicodeEncodeError:
        print(_asciify(string), sep=sep, end=end, file=file, flush=flush)
def _safe_pager(string: str) -> None:
    """
    Pager output for both UTF-8 and ASCII encoding systems.

    Sends `string` to the `pydoc` pager. If that fails with a
    `UnicodeEncodeError`, the ASCII-only version of the text is paged
    instead.
    """
    import pydoc

    show = pydoc.pager

    try:
        show(string)
    except UnicodeEncodeError:
        show(_asciify(string))
2022-01-26 15:54:36 -08:00
def warning_message(message_lines: List[str]) -> None:
    """
    Prints warning message to `STDERR` for non-fatal issues. The first line
    is prepended with 'jc: Warning - ' and subsequent lines are indented.
    Wraps text as needed based on the terminal width.

    Parameters:

        message_lines:   (list) list of string lines. A single string is
                         also accepted for backwards compatibility. The
                         list is not modified. Nothing is printed if it
                         is empty.

    Returns:

        None - just prints output to STDERR
    """
    # this is for backwards compatibility with existing custom parsers
    if isinstance(message_lines, str):
        message_lines = [message_lines]

    # work on a copy so the caller's list is never mutated
    lines = list(message_lines)
    if not lines:
        return

    columns = shutil.get_terminal_size().columns

    # NOTE(review): the continuation indents (15/19) are one column wider
    # than the 14-character prefix below - confirm the intended alignment.
    first_wrapper = TextWrapper(width=columns, subsequent_indent=' ' * 15)
    next_wrapper = TextWrapper(width=columns, initial_indent=' ' * 15,
                               subsequent_indent=' ' * 19)

    first_line, *remaining_lines = lines
    first_str = first_wrapper.fill(f'jc: Warning - {first_line}')
    _safe_print(first_str, file=sys.stderr)

    for line in remaining_lines:
        # blank lines are skipped rather than printed as empty rows
        if line == '':
            continue

        _safe_print(next_wrapper.fill(line), file=sys.stderr)
2021-09-23 20:53:31 -07:00
2022-01-26 15:54:36 -08:00
def error_message(message_lines: List[str]) -> None:
    """
    Prints an error message to `STDERR` for fatal issues. The first line is
    prepended with 'jc: Error - ' and subsequent lines are indented.
    Wraps text as needed based on the terminal width.

    Parameters:

        message_lines:   (list) list of string lines. A single string is
                         also accepted, matching `warning_message`. The
                         list is not modified. Nothing is printed if it
                         is empty.

    Returns:

        None - just prints output to STDERR
    """
    # accept a bare string the same way warning_message() does
    if isinstance(message_lines, str):
        message_lines = [message_lines]

    # work on a copy so the caller's list is never mutated
    lines = list(message_lines)
    if not lines:
        return

    columns = shutil.get_terminal_size().columns

    # NOTE(review): the continuation indents (13/17) are one column wider
    # than the 12-character prefix below - confirm the intended alignment.
    first_wrapper = TextWrapper(width=columns, subsequent_indent=' ' * 13)
    next_wrapper = TextWrapper(width=columns, initial_indent=' ' * 13,
                               subsequent_indent=' ' * 17)

    first_line, *remaining_lines = lines
    first_str = first_wrapper.fill(f'jc: Error - {first_line}')
    _safe_print(first_str, file=sys.stderr)

    for line in remaining_lines:
        # blank lines are skipped rather than printed as empty rows
        if line == '':
            continue

        _safe_print(next_wrapper.fill(line), file=sys.stderr)
2022-10-15 14:34:09 -07:00
def is_compatible(compatible: List[str]) -> bool:
    """
    Returns True if the parser is compatible with the running OS platform.

    A platform name is considered a match when `sys.platform` starts with
    it. An empty list is never compatible.
    """
    return any(sys.platform.startswith(name) for name in compatible)
2022-10-15 14:34:09 -07:00
def compatibility(mod_name: str, compatible: List[str], quiet: bool = False) -> None:
    """
    Checks for the parser's compatibility with the running OS platform and
    prints a warning message to `STDERR` if not compatible and
    `quiet=False`.

    Parameters:

        mod_name:     (string) __name__ of the calling module

        compatible:   (list) sys.platform name(s) compatible with
                      the parser. compatible options:
                      linux, darwin, cygwin, win32, aix, freebsd

        quiet:        (bool) suppress compatibility message if True

    Returns:

        None - just prints output to STDERR
    """
    # nothing to report when silenced or when the platform matches
    if quiet or is_compatible(compatible):
        return

    # only the last dotted component of the module name is shown
    parser_name = mod_name.split('.')[-1]
    platforms = ', '.join(compatible)

    warning_message([
        f'{parser_name} parser is not compatible with your OS ({sys.platform}).',
        f'Compatible platforms: {platforms}'
    ])
2022-07-01 15:48:29 -07:00
def has_data(data: Union[str, bytes]) -> bool:
    """
    Checks if the input contains data. For strings, any non-whitespace
    character counts as data. For bytes, any content at all counts.

    Parameters:

        data:        (string, bytes) input to check whether it contains data

    Returns:

        Boolean      True if input string (data) contains non-whitespace
                     characters, otherwise False. For bytes data, returns
                     True if there is any data, otherwise False.
    """
    if not isinstance(data, str):
        return bool(data)

    # empty strings and whitespace-only strings hold no data
    return len(data) > 0 and not data.isspace()
def convert_to_int(value: object) -> Optional[int]:
    """
    Converts string and float input to int. Strips all non-numeric
    characters from strings.

    Parameters:

        value:         (string/float) Input value

    Returns:

        integer/None   Integer if successful conversion, otherwise None.
                       Values that have no integer representation
                       (infinity, NaN) also return None.
    """
    if isinstance(value, str):
        # keep only digits, minus signs and decimal points
        str_val = re.sub(r'[^0-9\-\.]', '', value)

        try:
            return int(str_val)
        except (ValueError, TypeError):
            # not a plain integer: try it as a decimal and truncate
            try:
                return int(float(str_val))
            except (ValueError, TypeError, OverflowError):
                # OverflowError: a very long digit string parses to
                # float infinity, which cannot be converted to int
                return None

    if isinstance(value, (int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError
            return None

    return None
2021-04-18 11:46:42 -07:00
def convert_to_float(value: object) -> Optional[float]:
    """
    Converts string and int input to float. Strips all non-numeric
    characters from strings.

    Parameters:

        value:         (string/integer) Input value

    Returns:

        float/None     Float if successful conversion, otherwise None.
                       Integers too large to be represented as a float
                       also return None.
    """
    if isinstance(value, str):
        try:
            # keep only digits, minus signs and decimal points
            return float(re.sub(r'[^0-9\-\.]', '', value))
        except (ValueError, TypeError):
            return None

    if isinstance(value, (int, float)):
        try:
            return float(value)
        except OverflowError:
            # raised by float() for ints beyond the float range
            return None

    return None
2021-04-17 17:22:59 -07:00
def convert_to_bool(value: object) -> bool:
    """
    Converts string, integer, or float input to boolean by checking
    for 'truthy' values.

    Parameters:

        value:          (string/integer/float) Input value

    Returns:

        True/False      False unless a 'truthy' number or string is found
                        ('y', 'yes', 'true', '1', 1, -1, etc.)
    """
    # Decision order:
    #   numbers  -> their own truthiness
    #   strings  -> if they convert to a float, that float's truthiness;
    #               otherwise True only for a known truthy word
    #   anything else -> False
    truthy_words = ('y', 'yes', 'true', '*')

    if isinstance(value, (int, float)):
        return bool(value)

    if not isinstance(value, str):
        return False

    try:
        numeric = convert_to_float(value)
    except Exception:
        numeric = None

    if numeric is not None:
        return bool(numeric)

    return value.lower() in truthy_words
def input_type_check(data: object) -> None:
    """Ensure input data is a string. Raises `TypeError` if not."""
    if isinstance(data, str):
        return

    raise TypeError("Input data must be a 'str' object.")
class timestamp:
2022-10-06 09:05:36 -07:00
__slots__ = ('string', 'format', 'naive', 'utc')
2022-02-07 16:58:06 -08:00
def __init__(self,
datetime_string: Optional[str],
2022-10-15 14:06:38 -07:00
format_hint: Optional[Iterable[int]] = None
2022-02-07 16:58:06 -08:00
) -> None:
2022-01-26 14:21:06 -08:00
"""
Input a datetime text string of several formats and convert to a
2022-01-26 14:21:06 -08:00
naive or timezone-aware epoch timestamp in UTC.
2021-04-03 14:48:30 -07:00
2022-01-26 14:21:06 -08:00
Parameters:
2022-02-01 17:54:22 -08:00
datetime_string (str): a string representation of a
datetime in several supported formats
2021-04-03 14:48:30 -07:00
2022-04-20 09:44:42 -04:00
format_hint (iterable): an optional iterable of format ID
integers to instruct the timestamp object to try those
formats first in the order given. Other formats will be
tried after the format hint list is exhausted. This can
speed up timestamp conversion so several different formats
don't have to be tried in brute-force fashion.
2022-02-01 17:54:22 -08:00
Returns a timestamp object with the following attributes:
2022-01-19 11:08:59 -08:00
2022-02-01 17:54:22 -08:00
string (str): the input datetime string
2022-01-19 11:08:59 -08:00
2022-02-01 17:54:22 -08:00
format (int | None): the format rule that was used to decode
the datetime string. None if conversion fails.
2022-01-19 11:08:59 -08:00
2022-02-01 17:54:22 -08:00
naive (int | None): timestamp based on locally configured
timezone. None if conversion fails.
2022-02-01 19:20:19 -08:00
utc (int | None): aware timestamp only if UTC timezone
2022-02-01 17:54:22 -08:00
detected in datetime string. None if conversion fails.
2022-01-26 14:21:06 -08:00
"""
self.string = datetime_string
if not format_hint:
format_hint = tuple()
else:
format_hint = tuple(format_hint)
dt = self._parse_dt(self.string, format_hint=format_hint)
self.format = dt['format']
self.naive = dt['timestamp_naive']
self.utc = dt['timestamp_utc']
2022-10-14 17:12:58 -07:00
def __repr__(self) -> str:
2022-02-07 08:06:48 -08:00
return f'timestamp(string={self.string!r}, format={self.format}, naive={self.naive}, utc={self.utc})'
@staticmethod
@lru_cache(maxsize=512)
2022-10-14 17:12:58 -07:00
def _parse_dt(
dt_string: Optional[str],
2022-10-14 17:12:58 -07:00
format_hint: Optional[Iterable[int]] = None
) -> Dict[str, Optional[int]]:
"""
Input a datetime text string of several formats and convert to
2022-01-19 11:08:59 -08:00
a naive or timezone-aware epoch timestamp in UTC.
Parameters:
dt_string: (string) a string representation of a date-time
in several supported formats
format_hint: (list | tuple) a list of format ID int's that
should be tried first. This can increase
performance since the function will not need to
try many incorrect formats before finding the
correct one.
Returns:
2022-02-01 17:54:22 -08:00
Dictionary of the following format:
2021-04-06 18:53:50 -07:00
{
2022-01-19 11:08:59 -08:00
# for debugging purposes. None if conversion fails
2022-02-01 17:54:22 -08:00
"format": int,
2022-01-19 11:08:59 -08:00
# timestamp based on locally configured timezone.
# None if conversion fails.
2022-02-01 17:54:22 -08:00
"timestamp_naive": int,
2022-01-19 11:08:59 -08:00
# aware timestamp only if UTC timezone detected.
# None if conversion fails.
2022-02-01 17:54:22 -08:00
"timestamp_utc": int
2021-04-06 18:53:50 -07:00
}
2022-01-19 11:08:59 -08:00
The `format` integer denotes which date_time format
conversion succeeded.
The `timestamp_naive` integer is the converted date-time
string to a naive epoch timestamp.
The `timestamp_utc` integer is the converted date-time
string to an aware epoch timestamp in the UTC timezone. If
an aware conversion cannot be performed (e.g. the UTC
timezone is not found in the date-time string), then this
field will be None.
2021-04-06 18:53:50 -07:00
If the conversion completely fails, all fields will be None.
"""
2022-10-18 11:01:59 -07:00
formats: tuple[TimeStampFormatType, ...] = (
{'id': 1000, 'format': '%a %b %d %H:%M:%S %Y', 'locale': None}, # manual C locale format conversion: Tue Mar 23 16:12:11 2021 or Tue Mar 23 16:12:11 IST 2021
2022-04-20 09:44:42 -04:00
{'id': 1100, 'format': '%a %b %d %H:%M:%S %Y %z', 'locale': None}, # git date output: Thu Mar 5 09:17:40 2020 -0800
2022-08-14 10:52:58 -07:00
{'id': 1300, 'format': '%Y-%m-%dT%H:%M:%S.%f%Z', 'locale': None}, # ISO Format with UTC (found in syslog 5424): 2003-10-11T22:14:15.003Z
{'id': 1310, 'format': '%Y-%m-%dT%H:%M:%S.%f', 'locale': None}, # ISO Format without TZ (found in syslog 5424): 2003-10-11T22:14:15.003
2022-08-17 17:45:47 -07:00
{'id': 1400, 'format': '%b %d %Y %H:%M:%S.%f UTC', 'locale': None}, # CEF Format with UTC: Nov 08 2022 12:30:00.111 UTC
{'id': 1410, 'format': '%b %d %Y %H:%M:%S.%f', 'locale': None}, # CEF Format without TZ: Nov 08 2022 12:30:00.111
{'id': 1420, 'format': '%b %d %Y %H:%M:%S UTC', 'locale': None}, # CEF Format with UTC without microseconds: Nov 08 2022 12:30:00 UTC
{'id': 1430, 'format': '%b %d %Y %H:%M:%S', 'locale': None}, # CEF Format without TZ or microseconds: Nov 08 2022 12:30:00
{'id': 1500, 'format': '%Y-%m-%d %H:%M', 'locale': None}, # en_US.UTF-8 local format (found in who cli output): 2021-03-23 00:14
2021-04-02 12:01:05 -07:00
{'id': 1600, 'format': '%m/%d/%Y %I:%M %p', 'locale': None}, # Windows english format (found in dir cli output): 12/07/2019 02:09 AM
{'id': 1700, 'format': '%m/%d/%Y, %I:%M:%S %p', 'locale': None}, # Windows english format wint non-UTC tz (found in systeminfo cli output): 3/22/2021, 1:15:51 PM (UTC-0600)
2021-05-16 19:09:53 -07:00
{'id': 1705, 'format': '%m/%d/%Y, %I:%M:%S %p %Z', 'locale': None}, # Windows english format with UTC tz (found in systeminfo cli output): 3/22/2021, 1:15:51 PM (UTC)
{'id': 1710, 'format': '%m/%d/%Y, %I:%M:%S %p UTC%z', 'locale': None}, # Windows english format with UTC tz (found in systeminfo cli output): 3/22/2021, 1:15:51 PM (UTC+0000)
{'id': 2000, 'format': '%a %d %b %Y %I:%M:%S %p %Z', 'locale': None}, # en_US.UTF-8 local format (found in upower cli output): Tue 23 Mar 2021 04:12:11 PM UTC
{'id': 3000, 'format': '%a %d %b %Y %I:%M:%S %p', 'locale': None}, # en_US.UTF-8 local format with non-UTC tz (found in upower cli output): Tue 23 Mar 2021 04:12:11 PM IST
{'id': 4000, 'format': '%A %d %B %Y %I:%M:%S %p %Z', 'locale': None}, # European-style local format (found in upower cli output): Tuesday 01 October 2019 12:50:41 PM UTC
{'id': 5000, 'format': '%A %d %B %Y %I:%M:%S %p', 'locale': None}, # European-style local format with non-UTC tz (found in upower cli output): Tuesday 01 October 2019 12:50:41 PM IST
{'id': 6000, 'format': '%a %b %d %I:%M:%S %p %Z %Y', 'locale': None}, # en_US.UTF-8 format (found in date cli): Wed Mar 24 06:16:19 PM UTC 2021
{'id': 7000, 'format': '%a %b %d %H:%M:%S %Z %Y', 'locale': None}, # C locale format (found in date cli): Wed Mar 24 11:11:30 UTC 2021
{'id': 7100, 'format': '%b %d %H:%M:%S %Y', 'locale': None}, # C locale format (found in stat cli output - osx): # Mar 29 11:49:05 2021
{'id': 7200, 'format': '%Y-%m-%d %H:%M:%S.%f %z', 'locale': None}, # C locale format (found in stat cli output - linux): 2019-08-13 18:13:43.555604315 -0400
2021-09-22 14:06:28 -07:00
{'id': 7250, 'format': '%Y-%m-%d %H:%M:%S', 'locale': None}, # C locale format with non-UTC tz (found in modified vmstat cli output): # 2021-09-16 20:32:28 PDT
{'id': 7255, 'format': '%Y-%m-%d %H:%M:%S %Z', 'locale': None}, # C locale format (found in modified vmstat cli output): # 2021-09-16 20:32:28 UTC
{'id': 7300, 'format': '%a %Y-%m-%d %H:%M:%S %Z', 'locale': None}, # C locale format (found in timedatectl cli output): # Wed 2020-03-11 00:53:21 UTC
# attempt locale changes last
{'id': 8000, 'format': '%a %d %b %Y %H:%M:%S %Z', 'locale': ''}, # current locale format (found in upower cli output): # mar. 23 mars 2021 23:12:11 UTC
{'id': 8100, 'format': '%a %d %b %Y %H:%M:%S', 'locale': ''}, # current locale format with non-UTC tz (found in upower cli output): # mar. 23 mars 2021 19:12:11 EDT
{'id': 8200, 'format': '%A %d %B %Y, %H:%M:%S UTC%z', 'locale': ''}, # fr_FR.utf8 locale format (found in date cli output): vendredi 26 mars 2021, 13:26:46 (UTC+0000)
{'id': 8300, 'format': '%A %d %B %Y, %H:%M:%S', 'locale': ''}, # fr_FR.utf8 locale format with non-UTC tz (found in date cli output): vendredi 26 mars 2021, 13:26:46 (UTC-0400)
{'id': 9000, 'format': '%c', 'locale': ''} # locally configured locale format conversion: Could be anything :) this is a last-gasp attempt
)
# from https://www.timeanddate.com/time/zones/
# only removed UTC timezone and added known non-UTC offsets
2022-10-14 17:12:58 -07:00
tz_abbr: set[str] = {
2022-01-26 14:38:57 -08:00
'A', 'ACDT', 'ACST', 'ACT', 'ACWST', 'ADT', 'AEDT', 'AEST', 'AET', 'AFT', 'AKDT',
'AKST', 'ALMT', 'AMST', 'AMT', 'ANAST', 'ANAT', 'AQTT', 'ART', 'AST', 'AT', 'AWDT',
'AWST', 'AZOST', 'AZOT', 'AZST', 'AZT', 'AoE', 'B', 'BNT', 'BOT', 'BRST', 'BRT', 'BST',
'BTT', 'C', 'CAST', 'CAT', 'CCT', 'CDT', 'CEST', 'CET', 'CHADT', 'CHAST', 'CHOST',
'CHOT', 'CHUT', 'CIDST', 'CIST', 'CKT', 'CLST', 'CLT', 'COT', 'CST', 'CT', 'CVT', 'CXT',
'ChST', 'D', 'DAVT', 'DDUT', 'E', 'EASST', 'EAST', 'EAT', 'ECT', 'EDT', 'EEST', 'EET',
'EGST', 'EGT', 'EST', 'ET', 'F', 'FET', 'FJST', 'FJT', 'FKST', 'FKT', 'FNT', 'G',
'GALT', 'GAMT', 'GET', 'GFT', 'GILT', 'GMT', 'GST', 'GYT', 'H', 'HDT', 'HKT', 'HOVST',
'HOVT', 'HST', 'I', 'ICT', 'IDT', 'IOT', 'IRDT', 'IRKST', 'IRKT', 'IRST', 'IST', 'JST',
'K', 'KGT', 'KOST', 'KRAST', 'KRAT', 'KST', 'KUYT', 'L', 'LHDT', 'LHST', 'LINT', 'M',
'MAGST', 'MAGT', 'MART', 'MAWT', 'MDT', 'MHT', 'MMT', 'MSD', 'MSK', 'MST', 'MT', 'MUT',
'MVT', 'MYT', 'N', 'NCT', 'NDT', 'NFDT', 'NFT', 'NOVST', 'NOVT', 'NPT', 'NRT', 'NST',
'NUT', 'NZDT', 'NZST', 'O', 'OMSST', 'OMST', 'ORAT', 'P', 'PDT', 'PET', 'PETST', 'PETT',
'PGT', 'PHOT', 'PHT', 'PKT', 'PMDT', 'PMST', 'PONT', 'PST', 'PT', 'PWT', 'PYST', 'PYT',
'Q', 'QYZT', 'R', 'RET', 'ROTT', 'S', 'SAKT', 'SAMT', 'SAST', 'SBT', 'SCT', 'SGT',
'SRET', 'SRT', 'SST', 'SYOT', 'T', 'TAHT', 'TFT', 'TJT', 'TKT', 'TLT', 'TMT', 'TOST',
'TOT', 'TRT', 'TVT', 'U', 'ULAST', 'ULAT', 'UYST', 'UYT', 'UZT', 'V', 'VET', 'VLAST',
'VLAT', 'VOST', 'VUT', 'W', 'WAKT', 'WARST', 'WAST', 'WAT', 'WEST', 'WET', 'WFT',
'WGST', 'WGT', 'WIB', 'WIT', 'WITA', 'WST', 'WT', 'X', 'Y', 'YAKST', 'YAKT', 'YAPT',
2022-08-14 10:52:58 -07:00
'YEKST', 'YEKT', 'UTC-1200', 'UTC-1100', 'UTC-1000', 'UTC-0930', 'UTC-0900',
2022-01-26 14:38:57 -08:00
'UTC-0800', 'UTC-0700', 'UTC-0600', 'UTC-0500', 'UTC-0400', 'UTC-0300', 'UTC-0230',
'UTC-0200', 'UTC-0100', 'UTC+0100', 'UTC+0200', 'UTC+0300', 'UTC+0400', 'UTC+0430',
'UTC+0500', 'UTC+0530', 'UTC+0545', 'UTC+0600', 'UTC+0630', 'UTC+0700', 'UTC+0800',
'UTC+0845', 'UTC+0900', 'UTC+1000', 'UTC+1030', 'UTC+1100', 'UTC+1200', 'UTC+1300',
'UTC+1345', 'UTC+1400'
}
2022-10-14 17:12:58 -07:00
offset_suffixes: tuple[str, ...] = (
'-12:00', '-11:00', '-10:00', '-09:30', '-09:00',
'-08:00', '-07:00', '-06:00', '-05:00', '-04:00', '-03:00', '-02:30',
'-02:00', '-01:00', '+01:00', '+02:00', '+03:00', '+04:00', '+04:30',
'+05:00', '+05:30', '+05:45', '+06:00', '+06:30', '+07:00', '+08:00',
'+08:45', '+09:00', '+10:00', '+10:30', '+11:00', '+12:00', '+13:00',
'+13:45', '+14:00'
)
2022-10-14 17:12:58 -07:00
data: str = dt_string or ''
normalized_datetime: str = ''
utc_tz: bool = False
dt: Optional[datetime] = None
dt_utc: Optional[datetime] = None
timestamp_naive: Optional[int] = None
timestamp_utc: Optional[int] = None
timestamp_obj: Dict[str, Optional[int]] = {
'format': None,
'timestamp_naive': None,
'timestamp_utc': None
}
# convert format_hint to a tuple so it is hashable (for lru_cache)
if not format_hint:
format_hint = tuple()
else:
format_hint = tuple(format_hint)
# sometimes UTC is referenced as 'Coordinated Universal Time'. Convert to 'UTC'
data = data.replace('Coordinated Universal Time', 'UTC')
# UTC can also be indicated with 'Z' for Zulu time (ISO-8601). Convert to 'UTC'
data = data.replace('Z', 'UTC')
if 'UTC' in data:
utc_tz = True
if 'UTC+' in data or 'UTC-' in data:
utc_tz = bool('UTC+0000' in data or 'UTC-0000' in data)
elif '+0000' in data or '-0000' in data:
utc_tz = True
# normalize the timezone by taking out any timezone reference, except UTC
cleandata = data.replace('(', '').replace(')', '')
2022-10-15 14:06:38 -07:00
normalized_datetime_list: List[str] = []
for term in cleandata.split():
if term not in tz_abbr:
normalized_datetime_list.append(term)
normalized_datetime = ' '.join(normalized_datetime_list)
# remove non UTC offset suffixes at the end of the string
for suffix in offset_suffixes:
if normalized_datetime.endswith(suffix):
normalized_datetime = normalized_datetime[0:-len(suffix)]
break
# normalize further by converting any greater-than 6-digit subsecond to 6-digits
p = re.compile(r'(\W\d\d:\d\d:\d\d\.\d{6})\d+\W')
normalized_datetime = p.sub(r'\g<1> ', normalized_datetime)
# try format hints first, then fall back to brute-force method
2022-10-18 11:01:59 -07:00
hint_obj_list: List[TimeStampFormatType] = []
for fmt_id in format_hint:
for fmt in formats:
if fmt_id == fmt['id']:
hint_obj_list.append(fmt)
remaining_formats = [fmt for fmt in formats if not fmt['id'] in format_hint]
optimized_formats = hint_obj_list + remaining_formats
for fmt in optimized_formats:
try:
2022-10-15 13:44:44 -07:00
locale.setlocale(locale.LC_TIME, fmt['locale'])
dt = datetime.strptime(normalized_datetime, fmt['format'])
timestamp_naive = int(dt.replace(tzinfo=None).timestamp())
2022-10-15 13:44:44 -07:00
timestamp_obj['format'] = fmt['id']
locale.setlocale(locale.LC_TIME, None)
break
except Exception:
locale.setlocale(locale.LC_TIME, None)
continue
if dt and utc_tz:
dt_utc = dt.replace(tzinfo=timezone.utc)
timestamp_utc = int(dt_utc.timestamp())
if timestamp_naive:
timestamp_obj['timestamp_naive'] = timestamp_naive
timestamp_obj['timestamp_utc'] = timestamp_utc
return timestamp_obj