1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-17 00:07:37 +02:00
Files
jc/jc/parsers/ls_s.py
Kelly Brazil 5c749fe26f simplify docs
2022-04-28 13:30:11 -07:00

192 lines
5.9 KiB
Python

"""jc - JSON Convert `ls` and `vdir` command output streaming parser
> This streaming parser outputs JSON Lines (cli) or returns a Generator
iterator of Dictionaries (module)
Requires the `-l` option to be used on `ls`. If there are newline characters
in the filename, then make sure to use the `-b` option on `ls`.
The `jc` `-qq` option can be used to ignore parsing errors. (e.g. filenames
with newline characters, but `-b` was not used)
The `epoch` calculated timestamp field is naive (i.e. based on the local
time of the system the parser is run on)
The `epoch_utc` calculated timestamp field is timezone-aware and is only
available if the timezone field is UTC.
Usage (cli):
$ ls | jc --ls-s
Usage (module):
import jc
result = jc.parse('ls_s', ls_command_output.splitlines())
for item in result:
# do something
Schema:
{
"filename": string,
"flags": string,
"links": integer,
"parent": string,
"owner": string,
"group": string,
"size": integer,
"date": string,
"epoch": integer, # [0]
"epoch_utc": integer, # [1]
# below object only exists if using -qq or ignore_exceptions=True
"_jc_meta": {
"success": boolean, # false if error parsing
"error": string, # exists if "success" is false
"line": string # exists if "success" is false
}
}
[0] naive timestamp if date field exists and can be converted.
[1] timezone aware timestamp if date field is in UTC and can
be converted
Examples:
$ ls -l /usr/bin | jc --ls-s
{"filename":"2to3-","flags":"-rwxr-xr-x","links":4,"owner":"root","...}
{"filename":"2to3-2.7","link_to":"../../System/Library/Frameworks/P...}
{"filename":"AssetCacheLocatorUtil","flags":"-rwxr-xr-x","links":1,...}
...
$ ls -l /usr/bin | jc --ls-s -r
{"filename":"2to3-","flags":"-rwxr-xr-x","links":"4","owner":"roo"..."}
{"filename":"2to3-2.7","link_to":"../../System/Library/Frameworks/P...}
{"filename":"AssetCacheLocatorUtil","flags":"-rwxr-xr-x","links":"1...}
...
"""
import re
import jc.utils
from jc.streaming import (
add_jc_meta, streaming_input_type_check, streaming_line_input_type_check, raise_or_yield
)
from jc.exceptions import ParseError
class info():
"""Provides parser metadata (version, author, etc.)"""
version = '1.0'
description = '`ls` command streaming parser'
author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com'
compatible = ['linux', 'darwin', 'cygwin', 'aix', 'freebsd']
streaming = True
__version__ = info.version
def _process(proc_data):
"""
Final processing to conform to the schema.
Parameters:
proc_data: (Dictionary) raw structured data to process
Returns:
Dictionary. Structured data to conform to the schema.
"""
int_list = ['links', 'size']
for key in proc_data:
if key in int_list:
proc_data[key] = jc.utils.convert_to_int(proc_data[key])
if 'date' in proc_data:
# to speed up processing only try to convert the date if it's not the default format
if not re.match(r'[a-zA-Z]{3}\s{1,2}\d{1,2}\s{1,2}[0-9:]{4,5}', proc_data['date']):
ts = jc.utils.timestamp(proc_data['date'], format_hint=(7200,))
proc_data['epoch'] = ts.naive
proc_data['epoch_utc'] = ts.utc
return proc_data
@add_jc_meta
def parse(data, raw=False, quiet=False, ignore_exceptions=False):
"""
Main text parsing generator function. Returns an iterable object.
Parameters:
data: (iterable) line-based text data to parse
(e.g. sys.stdin or str.splitlines())
raw: (boolean) unprocessed output if True
quiet: (boolean) suppress warning messages if True
ignore_exceptions: (boolean) ignore parsing exceptions if True
Returns:
Iterable of Dictionaries
"""
jc.utils.compatibility(__name__, info.compatible, quiet)
streaming_input_type_check(data)
parent = ''
for line in data:
try:
streaming_line_input_type_check(line)
# skip line if it starts with 'total 1234'
if re.match(r'total [0-9]+', line):
continue
# skip blank lines
if line.strip() == '':
continue
# Look for parent line if glob or -R is used
if not re.match(r'[-dclpsbDCMnP?]([-r][-w][-xsS]){2}([-r][-w][-xtT])[+]?', line) \
and line.strip().endswith(':'):
parent = line.strip()[:-1]
continue
if not re.match(r'[-dclpsbDCMnP?]([-r][-w][-xsS]){2}([-r][-w][-xtT])[+]?', line):
raise ParseError('Not ls -l data')
parsed_line = line.strip().split(maxsplit=8)
output_line = {}
# split filenames and links
if len(parsed_line) == 9:
filename_field = parsed_line[8].split(' -> ')
else:
# in case of filenames starting with a newline character
filename_field = ['']
# create output object
output_line['filename'] = filename_field[0]
if len(filename_field) > 1:
output_line['link_to'] = filename_field[1]
if parent:
output_line['parent'] = parent
output_line['flags'] = parsed_line[0]
output_line['links'] = parsed_line[1]
output_line['owner'] = parsed_line[2]
output_line['group'] = parsed_line[3]
output_line['size'] = parsed_line[4]
output_line['date'] = ' '.join(parsed_line[5:8])
yield output_line if raw else _process(output_line)
except Exception as e:
yield raise_or_yield(ignore_exceptions, e, line)