1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-17 00:07:37 +02:00

extract structured data. need to fix corner cases

This commit is contained in:
Kelly Brazil
2022-08-12 17:35:28 -07:00
parent 5f280c3e24
commit c24c5f7985

View File

@ -34,7 +34,7 @@ Examples:
[]
"""
import re
from typing import List, Dict
from typing import List, Dict, Optional
import jc.utils
@ -49,6 +49,39 @@ class info():
__version__ = info.version
def _extract_structs(structs_string: str) -> List[str]:
struct_match = re.compile(r'(?P<eachstruct>\[.+?(?<!\\)\])')
each_struct = struct_match.findall(structs_string)
my_structs = []
if each_struct:
for structured in each_struct:
my_structs.append(structured)
return my_structs
def _extract_ident(struct_string) -> Optional[str]:
ident = re.compile(r'''\[(?P<ident>[^\[\=\x22\]\x20]{1,32})\s''')
ident_match = ident.search(struct_string)
if ident_match:
return ident_match.group('ident')
return None
def _extract_kv(struct_string) -> List[Dict]:
key_vals = re.compile(r'''(?P<key>\w+)=(?P<val>\"[^\"]*\")''')
key_vals_match = key_vals.findall(struct_string)
kv_list = []
if key_vals_match:
for kv in key_vals_match:
key, val = kv
kv_list.append({key: val[1:-1]})
return kv_list
def _process(proc_data: List[Dict]) -> List[Dict]:
"""
Final processing to conform to the schema.
@ -69,37 +102,37 @@ def _process(proc_data: List[Dict]) -> List[Dict]:
r'\]': r']'
}
structured = re.compile(r'''
(?P<STRUCTUREDDATA>\[
(?P<ident>[^\[\=\x22\]\x20]{1,32})\s
(?P<keyval>[^\[\=\x22\x20]{1,32}=\x22.+\x22\s?)+\]
)
''', re.VERBOSE
)
each_struct = r'''(?P<eachstruct>\[.+?(?<!\\)\])'''
ident = r'''\[(?P<ident>[^\[\=\x22\]\x20]{1,32})\s'''
key_vals = r'''(?P<key>\w+)=(?P<val>\"[^\"]*\")'''
for item in proc_data:
for key, value in item.copy().items():
for key, value in item.items():
# remove any spaces around values
if item[key]:
item[key] = value.strip()
# fixup escaped characters
for esc, esc_sub in escape_map.items():
if item[key]:
if item[key] and isinstance(item[key], str):
item[key] = item[key].replace(esc, esc_sub)
# parse identity and key value pairs in the structured data section
# if proc_data['structured_data']:
# struct_match = structured.match(proc_data['structured_data'])
# if struct_match:
# struct_dict = struct_match.groupdict()
# parse identity and key value pairs in the structured data section
structs = None
if item['structured_data']:
structs_list = []
structs = _extract_structs(item['structured_data'])
for a_struct in structs:
struct_obj = {
'identity': _extract_ident(a_struct)
}
my_values = {}
for val_obj in _extract_kv(a_struct):
my_values.update(val_obj)
struct_obj.update({'values': my_values}) # type: ignore
structs_list.append(struct_obj)
item['structured_data'] = structs_list
return proc_data