mirror of
https://github.com/kellyjonbrazil/jc.git
synced 2025-06-17 00:07:37 +02:00
add toml parser
This commit is contained in:
@ -1,6 +1,7 @@
|
||||
jc changelog
|
||||
|
||||
20230103 v1.22.5
|
||||
- Add TOML parser
|
||||
- Update copyright dates
|
||||
- Fix INI parser to include top-level values with no section header
|
||||
- Fix INI and Key/Value parsers to only remove one quotation mark from the
|
||||
|
@ -174,6 +174,7 @@ parsers: List[str] = [
|
||||
'time',
|
||||
'timedatectl',
|
||||
'timestamp',
|
||||
'toml',
|
||||
'top',
|
||||
'top-s',
|
||||
'tracepath',
|
||||
|
126
jc/parsers/toml.py
Normal file
126
jc/parsers/toml.py
Normal file
@ -0,0 +1,126 @@
|
||||
"""jc - JSON Convert TOML file parser
|
||||
|
||||
Usage (cli):
|
||||
|
||||
$ cat file.toml | jc --toml
|
||||
|
||||
Usage (module):
|
||||
|
||||
import jc
|
||||
result = jc.parse('toml', toml_file)
|
||||
|
||||
Schema:
|
||||
|
||||
[
|
||||
{
|
||||
"toml": string,
|
||||
"bar": boolean,
|
||||
"baz": integer
|
||||
}
|
||||
]
|
||||
|
||||
Examples:
|
||||
|
||||
$ toml | jc --toml -p
|
||||
[]
|
||||
|
||||
$ toml | jc --toml -p -r
|
||||
[]
|
||||
"""
|
||||
from typing import Any
|
||||
from jc.jc_types import JSONDictType
|
||||
import jc.utils
|
||||
from jc.parsers import tomli
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class info():
|
||||
"""Provides parser metadata (version, author, etc.)"""
|
||||
version = '1.0'
|
||||
description = 'TOML file parser'
|
||||
author = 'Kelly Brazil'
|
||||
author_email = 'kellyjonbrazil@gmail.com'
|
||||
details = 'Using the tomli library at https://github.com/hukkin/tomli.'
|
||||
compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
|
||||
tags = ['standard', 'file', 'string']
|
||||
|
||||
|
||||
__version__ = info.version
|
||||
|
||||
|
||||
def _process(proc_data: JSONDictType) -> JSONDictType:
|
||||
"""
|
||||
Final processing to conform to the schema.
|
||||
|
||||
Parameters:
|
||||
|
||||
proc_data: (List of Dictionaries) raw structured data to process
|
||||
|
||||
Returns:
|
||||
|
||||
Dictionary. Structured to conform to the schema.
|
||||
"""
|
||||
return proc_data
|
||||
|
||||
|
||||
def _fix_objects(obj: Any) -> JSONDictType:
|
||||
"""
|
||||
Recursively traverse the nested dictionary or list and convert objects
|
||||
into JSON serializable types.
|
||||
"""
|
||||
if isinstance(obj, dict):
|
||||
for k, v in obj.copy().items():
|
||||
|
||||
if isinstance(v, datetime):
|
||||
iso = v.isoformat()
|
||||
v = int(round(v.timestamp()))
|
||||
obj.update({k: v, f'{k}_iso': iso})
|
||||
continue
|
||||
|
||||
if isinstance(v, dict):
|
||||
obj.update({k: _fix_objects(v)})
|
||||
continue
|
||||
|
||||
if isinstance(v, list):
|
||||
newlist = []
|
||||
for i in v:
|
||||
newlist.append(_fix_objects(i))
|
||||
obj.update({k: newlist})
|
||||
continue
|
||||
|
||||
if isinstance(obj, list):
|
||||
new_list = []
|
||||
for i in obj:
|
||||
new_list.append(_fix_objects(i))
|
||||
obj = new_list
|
||||
|
||||
return obj
|
||||
|
||||
|
||||
def parse(
|
||||
data: str,
|
||||
raw: bool = False,
|
||||
quiet: bool = False
|
||||
) -> JSONDictType:
|
||||
"""
|
||||
Main text parsing function
|
||||
|
||||
Parameters:
|
||||
|
||||
data: (string) text data to parse
|
||||
raw: (boolean) unprocessed output if True
|
||||
quiet: (boolean) suppress warning messages if True
|
||||
|
||||
Returns:
|
||||
|
||||
Dictionary. Raw or processed structured data.
|
||||
"""
|
||||
jc.utils.compatibility(__name__, info.compatible, quiet)
|
||||
jc.utils.input_type_check(data)
|
||||
|
||||
raw_output: JSONDictType = {}
|
||||
|
||||
if jc.utils.has_data(data):
|
||||
raw_output = _fix_objects(tomli.loads(data))
|
||||
|
||||
return raw_output if raw else _process(raw_output)
|
21
jc/parsers/tomli/LICENSE
Normal file
21
jc/parsers/tomli/LICENSE
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2021 Taneli Hukkinen
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
11
jc/parsers/tomli/__init__.py
Normal file
11
jc/parsers/tomli/__init__.py
Normal file
@ -0,0 +1,11 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# SPDX-FileCopyrightText: 2021 Taneli Hukkinen
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
__all__ = ("loads", "load", "TOMLDecodeError")
|
||||
__version__ = "2.0.1" # DO NOT EDIT THIS LINE MANUALLY. LET bump2version UTILITY DO IT
|
||||
|
||||
from ._parser import TOMLDecodeError, load, loads
|
||||
|
||||
# Pretend this exception was created here.
|
||||
TOMLDecodeError.__module__ = __name__
|
691
jc/parsers/tomli/_parser.py
Normal file
691
jc/parsers/tomli/_parser.py
Normal file
@ -0,0 +1,691 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# SPDX-FileCopyrightText: 2021 Taneli Hukkinen
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterable
|
||||
import string
|
||||
from types import MappingProxyType
|
||||
from typing import Any, BinaryIO, NamedTuple
|
||||
|
||||
from ._re import (
|
||||
RE_DATETIME,
|
||||
RE_LOCALTIME,
|
||||
RE_NUMBER,
|
||||
match_to_datetime,
|
||||
match_to_localtime,
|
||||
match_to_number,
|
||||
)
|
||||
from ._types import Key, ParseFloat, Pos
|
||||
|
||||
ASCII_CTRL = frozenset(chr(i) for i in range(32)) | frozenset(chr(127))
|
||||
|
||||
# Neither of these sets include quotation mark or backslash. They are
|
||||
# currently handled as separate cases in the parser functions.
|
||||
ILLEGAL_BASIC_STR_CHARS = ASCII_CTRL - frozenset("\t")
|
||||
ILLEGAL_MULTILINE_BASIC_STR_CHARS = ASCII_CTRL - frozenset("\t\n")
|
||||
|
||||
ILLEGAL_LITERAL_STR_CHARS = ILLEGAL_BASIC_STR_CHARS
|
||||
ILLEGAL_MULTILINE_LITERAL_STR_CHARS = ILLEGAL_MULTILINE_BASIC_STR_CHARS
|
||||
|
||||
ILLEGAL_COMMENT_CHARS = ILLEGAL_BASIC_STR_CHARS
|
||||
|
||||
TOML_WS = frozenset(" \t")
|
||||
TOML_WS_AND_NEWLINE = TOML_WS | frozenset("\n")
|
||||
BARE_KEY_CHARS = frozenset(string.ascii_letters + string.digits + "-_")
|
||||
KEY_INITIAL_CHARS = BARE_KEY_CHARS | frozenset("\"'")
|
||||
HEXDIGIT_CHARS = frozenset(string.hexdigits)
|
||||
|
||||
BASIC_STR_ESCAPE_REPLACEMENTS = MappingProxyType(
|
||||
{
|
||||
"\\b": "\u0008", # backspace
|
||||
"\\t": "\u0009", # tab
|
||||
"\\n": "\u000A", # linefeed
|
||||
"\\f": "\u000C", # form feed
|
||||
"\\r": "\u000D", # carriage return
|
||||
'\\"': "\u0022", # quote
|
||||
"\\\\": "\u005C", # backslash
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class TOMLDecodeError(ValueError):
|
||||
"""An error raised if a document is not valid TOML."""
|
||||
|
||||
|
||||
def load(__fp: BinaryIO, *, parse_float: ParseFloat = float) -> dict[str, Any]:
|
||||
"""Parse TOML from a binary file object."""
|
||||
b = __fp.read()
|
||||
try:
|
||||
s = b.decode()
|
||||
except AttributeError:
|
||||
raise TypeError(
|
||||
"File must be opened in binary mode, e.g. use `open('foo.toml', 'rb')`"
|
||||
) from None
|
||||
return loads(s, parse_float=parse_float)
|
||||
|
||||
|
||||
def loads(__s: str, *, parse_float: ParseFloat = float) -> dict[str, Any]: # noqa: C901
|
||||
"""Parse TOML from a string."""
|
||||
|
||||
# The spec allows converting "\r\n" to "\n", even in string
|
||||
# literals. Let's do so to simplify parsing.
|
||||
src = __s.replace("\r\n", "\n")
|
||||
pos = 0
|
||||
out = Output(NestedDict(), Flags())
|
||||
header: Key = ()
|
||||
parse_float = make_safe_parse_float(parse_float)
|
||||
|
||||
# Parse one statement at a time
|
||||
# (typically means one line in TOML source)
|
||||
while True:
|
||||
# 1. Skip line leading whitespace
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
|
||||
# 2. Parse rules. Expect one of the following:
|
||||
# - end of file
|
||||
# - end of line
|
||||
# - comment
|
||||
# - key/value pair
|
||||
# - append dict to list (and move to its namespace)
|
||||
# - create dict (and move to its namespace)
|
||||
# Skip trailing whitespace when applicable.
|
||||
try:
|
||||
char = src[pos]
|
||||
except IndexError:
|
||||
break
|
||||
if char == "\n":
|
||||
pos += 1
|
||||
continue
|
||||
if char in KEY_INITIAL_CHARS:
|
||||
pos = key_value_rule(src, pos, out, header, parse_float)
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
elif char == "[":
|
||||
try:
|
||||
second_char: str | None = src[pos + 1]
|
||||
except IndexError:
|
||||
second_char = None
|
||||
out.flags.finalize_pending()
|
||||
if second_char == "[":
|
||||
pos, header = create_list_rule(src, pos, out)
|
||||
else:
|
||||
pos, header = create_dict_rule(src, pos, out)
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
elif char != "#":
|
||||
raise suffixed_err(src, pos, "Invalid statement")
|
||||
|
||||
# 3. Skip comment
|
||||
pos = skip_comment(src, pos)
|
||||
|
||||
# 4. Expect end of line or end of file
|
||||
try:
|
||||
char = src[pos]
|
||||
except IndexError:
|
||||
break
|
||||
if char != "\n":
|
||||
raise suffixed_err(
|
||||
src, pos, "Expected newline or end of document after a statement"
|
||||
)
|
||||
pos += 1
|
||||
|
||||
return out.data.dict
|
||||
|
||||
|
||||
class Flags:
|
||||
"""Flags that map to parsed keys/namespaces."""
|
||||
|
||||
# Marks an immutable namespace (inline array or inline table).
|
||||
FROZEN = 0
|
||||
# Marks a nest that has been explicitly created and can no longer
|
||||
# be opened using the "[table]" syntax.
|
||||
EXPLICIT_NEST = 1
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._flags: dict[str, dict] = {}
|
||||
self._pending_flags: set[tuple[Key, int]] = set()
|
||||
|
||||
def add_pending(self, key: Key, flag: int) -> None:
|
||||
self._pending_flags.add((key, flag))
|
||||
|
||||
def finalize_pending(self) -> None:
|
||||
for key, flag in self._pending_flags:
|
||||
self.set(key, flag, recursive=False)
|
||||
self._pending_flags.clear()
|
||||
|
||||
def unset_all(self, key: Key) -> None:
|
||||
cont = self._flags
|
||||
for k in key[:-1]:
|
||||
if k not in cont:
|
||||
return
|
||||
cont = cont[k]["nested"]
|
||||
cont.pop(key[-1], None)
|
||||
|
||||
def set(self, key: Key, flag: int, *, recursive: bool) -> None: # noqa: A003
|
||||
cont = self._flags
|
||||
key_parent, key_stem = key[:-1], key[-1]
|
||||
for k in key_parent:
|
||||
if k not in cont:
|
||||
cont[k] = {"flags": set(), "recursive_flags": set(), "nested": {}}
|
||||
cont = cont[k]["nested"]
|
||||
if key_stem not in cont:
|
||||
cont[key_stem] = {"flags": set(), "recursive_flags": set(), "nested": {}}
|
||||
cont[key_stem]["recursive_flags" if recursive else "flags"].add(flag)
|
||||
|
||||
def is_(self, key: Key, flag: int) -> bool:
|
||||
if not key:
|
||||
return False # document root has no flags
|
||||
cont = self._flags
|
||||
for k in key[:-1]:
|
||||
if k not in cont:
|
||||
return False
|
||||
inner_cont = cont[k]
|
||||
if flag in inner_cont["recursive_flags"]:
|
||||
return True
|
||||
cont = inner_cont["nested"]
|
||||
key_stem = key[-1]
|
||||
if key_stem in cont:
|
||||
cont = cont[key_stem]
|
||||
return flag in cont["flags"] or flag in cont["recursive_flags"]
|
||||
return False
|
||||
|
||||
|
||||
class NestedDict:
|
||||
def __init__(self) -> None:
|
||||
# The parsed content of the TOML document
|
||||
self.dict: dict[str, Any] = {}
|
||||
|
||||
def get_or_create_nest(
|
||||
self,
|
||||
key: Key,
|
||||
*,
|
||||
access_lists: bool = True,
|
||||
) -> dict:
|
||||
cont: Any = self.dict
|
||||
for k in key:
|
||||
if k not in cont:
|
||||
cont[k] = {}
|
||||
cont = cont[k]
|
||||
if access_lists and isinstance(cont, list):
|
||||
cont = cont[-1]
|
||||
if not isinstance(cont, dict):
|
||||
raise KeyError("There is no nest behind this key")
|
||||
return cont
|
||||
|
||||
def append_nest_to_list(self, key: Key) -> None:
|
||||
cont = self.get_or_create_nest(key[:-1])
|
||||
last_key = key[-1]
|
||||
if last_key in cont:
|
||||
list_ = cont[last_key]
|
||||
if not isinstance(list_, list):
|
||||
raise KeyError("An object other than list found behind this key")
|
||||
list_.append({})
|
||||
else:
|
||||
cont[last_key] = [{}]
|
||||
|
||||
|
||||
class Output(NamedTuple):
|
||||
data: NestedDict
|
||||
flags: Flags
|
||||
|
||||
|
||||
def skip_chars(src: str, pos: Pos, chars: Iterable[str]) -> Pos:
|
||||
try:
|
||||
while src[pos] in chars:
|
||||
pos += 1
|
||||
except IndexError:
|
||||
pass
|
||||
return pos
|
||||
|
||||
|
||||
def skip_until(
|
||||
src: str,
|
||||
pos: Pos,
|
||||
expect: str,
|
||||
*,
|
||||
error_on: frozenset[str],
|
||||
error_on_eof: bool,
|
||||
) -> Pos:
|
||||
try:
|
||||
new_pos = src.index(expect, pos)
|
||||
except ValueError:
|
||||
new_pos = len(src)
|
||||
if error_on_eof:
|
||||
raise suffixed_err(src, new_pos, f"Expected {expect!r}") from None
|
||||
|
||||
if not error_on.isdisjoint(src[pos:new_pos]):
|
||||
while src[pos] not in error_on:
|
||||
pos += 1
|
||||
raise suffixed_err(src, pos, f"Found invalid character {src[pos]!r}")
|
||||
return new_pos
|
||||
|
||||
|
||||
def skip_comment(src: str, pos: Pos) -> Pos:
|
||||
try:
|
||||
char: str | None = src[pos]
|
||||
except IndexError:
|
||||
char = None
|
||||
if char == "#":
|
||||
return skip_until(
|
||||
src, pos + 1, "\n", error_on=ILLEGAL_COMMENT_CHARS, error_on_eof=False
|
||||
)
|
||||
return pos
|
||||
|
||||
|
||||
def skip_comments_and_array_ws(src: str, pos: Pos) -> Pos:
|
||||
while True:
|
||||
pos_before_skip = pos
|
||||
pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
|
||||
pos = skip_comment(src, pos)
|
||||
if pos == pos_before_skip:
|
||||
return pos
|
||||
|
||||
|
||||
def create_dict_rule(src: str, pos: Pos, out: Output) -> tuple[Pos, Key]:
|
||||
pos += 1 # Skip "["
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
pos, key = parse_key(src, pos)
|
||||
|
||||
if out.flags.is_(key, Flags.EXPLICIT_NEST) or out.flags.is_(key, Flags.FROZEN):
|
||||
raise suffixed_err(src, pos, f"Cannot declare {key} twice")
|
||||
out.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
|
||||
try:
|
||||
out.data.get_or_create_nest(key)
|
||||
except KeyError:
|
||||
raise suffixed_err(src, pos, "Cannot overwrite a value") from None
|
||||
|
||||
if not src.startswith("]", pos):
|
||||
raise suffixed_err(src, pos, "Expected ']' at the end of a table declaration")
|
||||
return pos + 1, key
|
||||
|
||||
|
||||
def create_list_rule(src: str, pos: Pos, out: Output) -> tuple[Pos, Key]:
|
||||
pos += 2 # Skip "[["
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
pos, key = parse_key(src, pos)
|
||||
|
||||
if out.flags.is_(key, Flags.FROZEN):
|
||||
raise suffixed_err(src, pos, f"Cannot mutate immutable namespace {key}")
|
||||
# Free the namespace now that it points to another empty list item...
|
||||
out.flags.unset_all(key)
|
||||
# ...but this key precisely is still prohibited from table declaration
|
||||
out.flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
|
||||
try:
|
||||
out.data.append_nest_to_list(key)
|
||||
except KeyError:
|
||||
raise suffixed_err(src, pos, "Cannot overwrite a value") from None
|
||||
|
||||
if not src.startswith("]]", pos):
|
||||
raise suffixed_err(src, pos, "Expected ']]' at the end of an array declaration")
|
||||
return pos + 2, key
|
||||
|
||||
|
||||
def key_value_rule(
|
||||
src: str, pos: Pos, out: Output, header: Key, parse_float: ParseFloat
|
||||
) -> Pos:
|
||||
pos, key, value = parse_key_value_pair(src, pos, parse_float)
|
||||
key_parent, key_stem = key[:-1], key[-1]
|
||||
abs_key_parent = header + key_parent
|
||||
|
||||
relative_path_cont_keys = (header + key[:i] for i in range(1, len(key)))
|
||||
for cont_key in relative_path_cont_keys:
|
||||
# Check that dotted key syntax does not redefine an existing table
|
||||
if out.flags.is_(cont_key, Flags.EXPLICIT_NEST):
|
||||
raise suffixed_err(src, pos, f"Cannot redefine namespace {cont_key}")
|
||||
# Containers in the relative path can't be opened with the table syntax or
|
||||
# dotted key/value syntax in following table sections.
|
||||
out.flags.add_pending(cont_key, Flags.EXPLICIT_NEST)
|
||||
|
||||
if out.flags.is_(abs_key_parent, Flags.FROZEN):
|
||||
raise suffixed_err(
|
||||
src, pos, f"Cannot mutate immutable namespace {abs_key_parent}"
|
||||
)
|
||||
|
||||
try:
|
||||
nest = out.data.get_or_create_nest(abs_key_parent)
|
||||
except KeyError:
|
||||
raise suffixed_err(src, pos, "Cannot overwrite a value") from None
|
||||
if key_stem in nest:
|
||||
raise suffixed_err(src, pos, "Cannot overwrite a value")
|
||||
# Mark inline table and array namespaces recursively immutable
|
||||
if isinstance(value, (dict, list)):
|
||||
out.flags.set(header + key, Flags.FROZEN, recursive=True)
|
||||
nest[key_stem] = value
|
||||
return pos
|
||||
|
||||
|
||||
def parse_key_value_pair(
|
||||
src: str, pos: Pos, parse_float: ParseFloat
|
||||
) -> tuple[Pos, Key, Any]:
|
||||
pos, key = parse_key(src, pos)
|
||||
try:
|
||||
char: str | None = src[pos]
|
||||
except IndexError:
|
||||
char = None
|
||||
if char != "=":
|
||||
raise suffixed_err(src, pos, "Expected '=' after a key in a key/value pair")
|
||||
pos += 1
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
pos, value = parse_value(src, pos, parse_float)
|
||||
return pos, key, value
|
||||
|
||||
|
||||
def parse_key(src: str, pos: Pos) -> tuple[Pos, Key]:
|
||||
pos, key_part = parse_key_part(src, pos)
|
||||
key: Key = (key_part,)
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
while True:
|
||||
try:
|
||||
char: str | None = src[pos]
|
||||
except IndexError:
|
||||
char = None
|
||||
if char != ".":
|
||||
return pos, key
|
||||
pos += 1
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
pos, key_part = parse_key_part(src, pos)
|
||||
key += (key_part,)
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
|
||||
|
||||
def parse_key_part(src: str, pos: Pos) -> tuple[Pos, str]:
|
||||
try:
|
||||
char: str | None = src[pos]
|
||||
except IndexError:
|
||||
char = None
|
||||
if char in BARE_KEY_CHARS:
|
||||
start_pos = pos
|
||||
pos = skip_chars(src, pos, BARE_KEY_CHARS)
|
||||
return pos, src[start_pos:pos]
|
||||
if char == "'":
|
||||
return parse_literal_str(src, pos)
|
||||
if char == '"':
|
||||
return parse_one_line_basic_str(src, pos)
|
||||
raise suffixed_err(src, pos, "Invalid initial character for a key part")
|
||||
|
||||
|
||||
def parse_one_line_basic_str(src: str, pos: Pos) -> tuple[Pos, str]:
|
||||
pos += 1
|
||||
return parse_basic_str(src, pos, multiline=False)
|
||||
|
||||
|
||||
def parse_array(src: str, pos: Pos, parse_float: ParseFloat) -> tuple[Pos, list]:
|
||||
pos += 1
|
||||
array: list = []
|
||||
|
||||
pos = skip_comments_and_array_ws(src, pos)
|
||||
if src.startswith("]", pos):
|
||||
return pos + 1, array
|
||||
while True:
|
||||
pos, val = parse_value(src, pos, parse_float)
|
||||
array.append(val)
|
||||
pos = skip_comments_and_array_ws(src, pos)
|
||||
|
||||
c = src[pos : pos + 1]
|
||||
if c == "]":
|
||||
return pos + 1, array
|
||||
if c != ",":
|
||||
raise suffixed_err(src, pos, "Unclosed array")
|
||||
pos += 1
|
||||
|
||||
pos = skip_comments_and_array_ws(src, pos)
|
||||
if src.startswith("]", pos):
|
||||
return pos + 1, array
|
||||
|
||||
|
||||
def parse_inline_table(src: str, pos: Pos, parse_float: ParseFloat) -> tuple[Pos, dict]:
|
||||
pos += 1
|
||||
nested_dict = NestedDict()
|
||||
flags = Flags()
|
||||
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
if src.startswith("}", pos):
|
||||
return pos + 1, nested_dict.dict
|
||||
while True:
|
||||
pos, key, value = parse_key_value_pair(src, pos, parse_float)
|
||||
key_parent, key_stem = key[:-1], key[-1]
|
||||
if flags.is_(key, Flags.FROZEN):
|
||||
raise suffixed_err(src, pos, f"Cannot mutate immutable namespace {key}")
|
||||
try:
|
||||
nest = nested_dict.get_or_create_nest(key_parent, access_lists=False)
|
||||
except KeyError:
|
||||
raise suffixed_err(src, pos, "Cannot overwrite a value") from None
|
||||
if key_stem in nest:
|
||||
raise suffixed_err(src, pos, f"Duplicate inline table key {key_stem!r}")
|
||||
nest[key_stem] = value
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
c = src[pos : pos + 1]
|
||||
if c == "}":
|
||||
return pos + 1, nested_dict.dict
|
||||
if c != ",":
|
||||
raise suffixed_err(src, pos, "Unclosed inline table")
|
||||
if isinstance(value, (dict, list)):
|
||||
flags.set(key, Flags.FROZEN, recursive=True)
|
||||
pos += 1
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
|
||||
|
||||
def parse_basic_str_escape(
|
||||
src: str, pos: Pos, *, multiline: bool = False
|
||||
) -> tuple[Pos, str]:
|
||||
escape_id = src[pos : pos + 2]
|
||||
pos += 2
|
||||
if multiline and escape_id in {"\\ ", "\\\t", "\\\n"}:
|
||||
# Skip whitespace until next non-whitespace character or end of
|
||||
# the doc. Error if non-whitespace is found before newline.
|
||||
if escape_id != "\\\n":
|
||||
pos = skip_chars(src, pos, TOML_WS)
|
||||
try:
|
||||
char = src[pos]
|
||||
except IndexError:
|
||||
return pos, ""
|
||||
if char != "\n":
|
||||
raise suffixed_err(src, pos, "Unescaped '\\' in a string")
|
||||
pos += 1
|
||||
pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
|
||||
return pos, ""
|
||||
if escape_id == "\\u":
|
||||
return parse_hex_char(src, pos, 4)
|
||||
if escape_id == "\\U":
|
||||
return parse_hex_char(src, pos, 8)
|
||||
try:
|
||||
return pos, BASIC_STR_ESCAPE_REPLACEMENTS[escape_id]
|
||||
except KeyError:
|
||||
raise suffixed_err(src, pos, "Unescaped '\\' in a string") from None
|
||||
|
||||
|
||||
def parse_basic_str_escape_multiline(src: str, pos: Pos) -> tuple[Pos, str]:
|
||||
return parse_basic_str_escape(src, pos, multiline=True)
|
||||
|
||||
|
||||
def parse_hex_char(src: str, pos: Pos, hex_len: int) -> tuple[Pos, str]:
|
||||
hex_str = src[pos : pos + hex_len]
|
||||
if len(hex_str) != hex_len or not HEXDIGIT_CHARS.issuperset(hex_str):
|
||||
raise suffixed_err(src, pos, "Invalid hex value")
|
||||
pos += hex_len
|
||||
hex_int = int(hex_str, 16)
|
||||
if not is_unicode_scalar_value(hex_int):
|
||||
raise suffixed_err(src, pos, "Escaped character is not a Unicode scalar value")
|
||||
return pos, chr(hex_int)
|
||||
|
||||
|
||||
def parse_literal_str(src: str, pos: Pos) -> tuple[Pos, str]:
|
||||
pos += 1 # Skip starting apostrophe
|
||||
start_pos = pos
|
||||
pos = skip_until(
|
||||
src, pos, "'", error_on=ILLEGAL_LITERAL_STR_CHARS, error_on_eof=True
|
||||
)
|
||||
return pos + 1, src[start_pos:pos] # Skip ending apostrophe
|
||||
|
||||
|
||||
def parse_multiline_str(src: str, pos: Pos, *, literal: bool) -> tuple[Pos, str]:
|
||||
pos += 3
|
||||
if src.startswith("\n", pos):
|
||||
pos += 1
|
||||
|
||||
if literal:
|
||||
delim = "'"
|
||||
end_pos = skip_until(
|
||||
src,
|
||||
pos,
|
||||
"'''",
|
||||
error_on=ILLEGAL_MULTILINE_LITERAL_STR_CHARS,
|
||||
error_on_eof=True,
|
||||
)
|
||||
result = src[pos:end_pos]
|
||||
pos = end_pos + 3
|
||||
else:
|
||||
delim = '"'
|
||||
pos, result = parse_basic_str(src, pos, multiline=True)
|
||||
|
||||
# Add at maximum two extra apostrophes/quotes if the end sequence
|
||||
# is 4 or 5 chars long instead of just 3.
|
||||
if not src.startswith(delim, pos):
|
||||
return pos, result
|
||||
pos += 1
|
||||
if not src.startswith(delim, pos):
|
||||
return pos, result + delim
|
||||
pos += 1
|
||||
return pos, result + (delim * 2)
|
||||
|
||||
|
||||
def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]:
|
||||
if multiline:
|
||||
error_on = ILLEGAL_MULTILINE_BASIC_STR_CHARS
|
||||
parse_escapes = parse_basic_str_escape_multiline
|
||||
else:
|
||||
error_on = ILLEGAL_BASIC_STR_CHARS
|
||||
parse_escapes = parse_basic_str_escape
|
||||
result = ""
|
||||
start_pos = pos
|
||||
while True:
|
||||
try:
|
||||
char = src[pos]
|
||||
except IndexError:
|
||||
raise suffixed_err(src, pos, "Unterminated string") from None
|
||||
if char == '"':
|
||||
if not multiline:
|
||||
return pos + 1, result + src[start_pos:pos]
|
||||
if src.startswith('"""', pos):
|
||||
return pos + 3, result + src[start_pos:pos]
|
||||
pos += 1
|
||||
continue
|
||||
if char == "\\":
|
||||
result += src[start_pos:pos]
|
||||
pos, parsed_escape = parse_escapes(src, pos)
|
||||
result += parsed_escape
|
||||
start_pos = pos
|
||||
continue
|
||||
if char in error_on:
|
||||
raise suffixed_err(src, pos, f"Illegal character {char!r}")
|
||||
pos += 1
|
||||
|
||||
|
||||
def parse_value( # noqa: C901
|
||||
src: str, pos: Pos, parse_float: ParseFloat
|
||||
) -> tuple[Pos, Any]:
|
||||
try:
|
||||
char: str | None = src[pos]
|
||||
except IndexError:
|
||||
char = None
|
||||
|
||||
# IMPORTANT: order conditions based on speed of checking and likelihood
|
||||
|
||||
# Basic strings
|
||||
if char == '"':
|
||||
if src.startswith('"""', pos):
|
||||
return parse_multiline_str(src, pos, literal=False)
|
||||
return parse_one_line_basic_str(src, pos)
|
||||
|
||||
# Literal strings
|
||||
if char == "'":
|
||||
if src.startswith("'''", pos):
|
||||
return parse_multiline_str(src, pos, literal=True)
|
||||
return parse_literal_str(src, pos)
|
||||
|
||||
# Booleans
|
||||
if char == "t":
|
||||
if src.startswith("true", pos):
|
||||
return pos + 4, True
|
||||
if char == "f":
|
||||
if src.startswith("false", pos):
|
||||
return pos + 5, False
|
||||
|
||||
# Arrays
|
||||
if char == "[":
|
||||
return parse_array(src, pos, parse_float)
|
||||
|
||||
# Inline tables
|
||||
if char == "{":
|
||||
return parse_inline_table(src, pos, parse_float)
|
||||
|
||||
# Dates and times
|
||||
datetime_match = RE_DATETIME.match(src, pos)
|
||||
if datetime_match:
|
||||
try:
|
||||
datetime_obj = match_to_datetime(datetime_match)
|
||||
except ValueError as e:
|
||||
raise suffixed_err(src, pos, "Invalid date or datetime") from e
|
||||
return datetime_match.end(), datetime_obj
|
||||
localtime_match = RE_LOCALTIME.match(src, pos)
|
||||
if localtime_match:
|
||||
return localtime_match.end(), match_to_localtime(localtime_match)
|
||||
|
||||
# Integers and "normal" floats.
|
||||
# The regex will greedily match any type starting with a decimal
|
||||
# char, so needs to be located after handling of dates and times.
|
||||
number_match = RE_NUMBER.match(src, pos)
|
||||
if number_match:
|
||||
return number_match.end(), match_to_number(number_match, parse_float)
|
||||
|
||||
# Special floats
|
||||
first_three = src[pos : pos + 3]
|
||||
if first_three in {"inf", "nan"}:
|
||||
return pos + 3, parse_float(first_three)
|
||||
first_four = src[pos : pos + 4]
|
||||
if first_four in {"-inf", "+inf", "-nan", "+nan"}:
|
||||
return pos + 4, parse_float(first_four)
|
||||
|
||||
raise suffixed_err(src, pos, "Invalid value")
|
||||
|
||||
|
||||
def suffixed_err(src: str, pos: Pos, msg: str) -> TOMLDecodeError:
|
||||
"""Return a `TOMLDecodeError` where error message is suffixed with
|
||||
coordinates in source."""
|
||||
|
||||
def coord_repr(src: str, pos: Pos) -> str:
|
||||
if pos >= len(src):
|
||||
return "end of document"
|
||||
line = src.count("\n", 0, pos) + 1
|
||||
if line == 1:
|
||||
column = pos + 1
|
||||
else:
|
||||
column = pos - src.rindex("\n", 0, pos)
|
||||
return f"line {line}, column {column}"
|
||||
|
||||
return TOMLDecodeError(f"{msg} (at {coord_repr(src, pos)})")
|
||||
|
||||
|
||||
def is_unicode_scalar_value(codepoint: int) -> bool:
|
||||
return (0 <= codepoint <= 55295) or (57344 <= codepoint <= 1114111)
|
||||
|
||||
|
||||
def make_safe_parse_float(parse_float: ParseFloat) -> ParseFloat:
|
||||
"""A decorator to make `parse_float` safe.
|
||||
|
||||
`parse_float` must not return dicts or lists, because these types
|
||||
would be mixed with parsed TOML tables and arrays, thus confusing
|
||||
the parser. The returned decorated callable raises `ValueError`
|
||||
instead of returning illegal types.
|
||||
"""
|
||||
# The default `float` callable never returns illegal types. Optimize it.
|
||||
if parse_float is float: # type: ignore[comparison-overlap]
|
||||
return float
|
||||
|
||||
def safe_parse_float(float_str: str) -> Any:
|
||||
float_value = parse_float(float_str)
|
||||
if isinstance(float_value, (dict, list)):
|
||||
raise ValueError("parse_float must not return dicts or lists")
|
||||
return float_value
|
||||
|
||||
return safe_parse_float
|
107
jc/parsers/tomli/_re.py
Normal file
107
jc/parsers/tomli/_re.py
Normal file
@ -0,0 +1,107 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# SPDX-FileCopyrightText: 2021 Taneli Hukkinen
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date, datetime, time, timedelta, timezone, tzinfo
|
||||
from functools import lru_cache
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from ._types import ParseFloat
|
||||
|
||||
# E.g.
|
||||
# - 00:32:00.999999
|
||||
# - 00:32:00
|
||||
_TIME_RE_STR = r"([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(?:\.([0-9]{1,6})[0-9]*)?"
|
||||
|
||||
RE_NUMBER = re.compile(
|
||||
r"""
|
||||
0
|
||||
(?:
|
||||
x[0-9A-Fa-f](?:_?[0-9A-Fa-f])* # hex
|
||||
|
|
||||
b[01](?:_?[01])* # bin
|
||||
|
|
||||
o[0-7](?:_?[0-7])* # oct
|
||||
)
|
||||
|
|
||||
[+-]?(?:0|[1-9](?:_?[0-9])*) # dec, integer part
|
||||
(?P<floatpart>
|
||||
(?:\.[0-9](?:_?[0-9])*)? # optional fractional part
|
||||
(?:[eE][+-]?[0-9](?:_?[0-9])*)? # optional exponent part
|
||||
)
|
||||
""",
|
||||
flags=re.VERBOSE,
|
||||
)
|
||||
RE_LOCALTIME = re.compile(_TIME_RE_STR)
|
||||
RE_DATETIME = re.compile(
|
||||
rf"""
|
||||
([0-9]{{4}})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01]) # date, e.g. 1988-10-27
|
||||
(?:
|
||||
[Tt ]
|
||||
{_TIME_RE_STR}
|
||||
(?:([Zz])|([+-])([01][0-9]|2[0-3]):([0-5][0-9]))? # optional time offset
|
||||
)?
|
||||
""",
|
||||
flags=re.VERBOSE,
|
||||
)
|
||||
|
||||
|
||||
def match_to_datetime(match: re.Match) -> datetime | date:
|
||||
"""Convert a `RE_DATETIME` match to `datetime.datetime` or `datetime.date`.
|
||||
|
||||
Raises ValueError if the match does not correspond to a valid date
|
||||
or datetime.
|
||||
"""
|
||||
(
|
||||
year_str,
|
||||
month_str,
|
||||
day_str,
|
||||
hour_str,
|
||||
minute_str,
|
||||
sec_str,
|
||||
micros_str,
|
||||
zulu_time,
|
||||
offset_sign_str,
|
||||
offset_hour_str,
|
||||
offset_minute_str,
|
||||
) = match.groups()
|
||||
year, month, day = int(year_str), int(month_str), int(day_str)
|
||||
if hour_str is None:
|
||||
return date(year, month, day)
|
||||
hour, minute, sec = int(hour_str), int(minute_str), int(sec_str)
|
||||
micros = int(micros_str.ljust(6, "0")) if micros_str else 0
|
||||
if offset_sign_str:
|
||||
tz: tzinfo | None = cached_tz(
|
||||
offset_hour_str, offset_minute_str, offset_sign_str
|
||||
)
|
||||
elif zulu_time:
|
||||
tz = timezone.utc
|
||||
else: # local date-time
|
||||
tz = None
|
||||
return datetime(year, month, day, hour, minute, sec, micros, tzinfo=tz)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def cached_tz(hour_str: str, minute_str: str, sign_str: str) -> timezone:
|
||||
sign = 1 if sign_str == "+" else -1
|
||||
return timezone(
|
||||
timedelta(
|
||||
hours=sign * int(hour_str),
|
||||
minutes=sign * int(minute_str),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def match_to_localtime(match: re.Match) -> time:
|
||||
hour_str, minute_str, sec_str, micros_str = match.groups()
|
||||
micros = int(micros_str.ljust(6, "0")) if micros_str else 0
|
||||
return time(int(hour_str), int(minute_str), int(sec_str), micros)
|
||||
|
||||
|
||||
def match_to_number(match: re.Match, parse_float: ParseFloat) -> Any:
|
||||
if match.group("floatpart"):
|
||||
return parse_float(match.group())
|
||||
return int(match.group(), 0)
|
10
jc/parsers/tomli/_types.py
Normal file
10
jc/parsers/tomli/_types.py
Normal file
@ -0,0 +1,10 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# SPDX-FileCopyrightText: 2021 Taneli Hukkinen
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
from typing import Any, Callable, Tuple
|
||||
|
||||
# Type annotations
|
||||
ParseFloat = Callable[[str], Any]
|
||||
Key = Tuple[str, ...]
|
||||
Pos = int
|
1
jc/parsers/tomli/py.typed
Normal file
1
jc/parsers/tomli/py.typed
Normal file
@ -0,0 +1 @@
|
||||
# Marker file for PEP 561
|
33
tests/fixtures/generic/toml-example.toml
vendored
Normal file
33
tests/fixtures/generic/toml-example.toml
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
# This is a TOML document. Boom.
|
||||
|
||||
title = "TOML Example"
|
||||
|
||||
[owner]
|
||||
name = "Lance Uppercut"
|
||||
dob = 1979-05-27T07:32:00-08:00 # First class dates? Why not?
|
||||
|
||||
[database]
|
||||
server = "192.168.1.1"
|
||||
ports = [ 8001, 8001, 8002 ]
|
||||
connection_max = 5000
|
||||
enabled = true
|
||||
|
||||
[servers]
|
||||
|
||||
# You can indent as you please. Tabs or spaces. TOML don't care.
|
||||
[servers.alpha]
|
||||
ip = "10.0.0.1"
|
||||
dc = "eqdc10"
|
||||
|
||||
[servers.beta]
|
||||
ip = "10.0.0.2"
|
||||
dc = "eqdc10"
|
||||
|
||||
[clients]
|
||||
data = [ ["gamma", "delta"], [1, 2] ]
|
||||
|
||||
# Line breaks are OK when inside arrays
|
||||
hosts = [
|
||||
"alpha",
|
||||
"omega"
|
||||
]
|
Reference in New Issue
Block a user