1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-19 00:17:51 +02:00
Files
jc/jc/parsers/ls_s.py
2021-09-12 16:35:46 -07:00

217 lines
8.0 KiB
Python

"""jc - JSON CLI output utility `ls` and `vdir` command output streaming parser
Options supported:
- `lbaR1`
- `--time-style=full-iso`
Note: The `-1`, `-l`, or `-b` option of `ls` should be used to correctly parse filenames that include newline characters. Since `ls` does not encode newlines in filenames when outputting to a pipe it will cause `jc` to see multiple files instead of a single file if `-1`, `-l`, or `-b` is not used. Alternatively, `vdir` can be used, which is the same as running `ls -lb`.
The `epoch` calculated timestamp field is naive (i.e. based on the local time of the system the parser is run on)
The `epoch_utc` calculated timestamp field is timezone-aware and is only available if the timezone field is UTC.
Usage (cli):
$ ls | jc --ls-s
Usage (module):
import jc.parsers.ls_s
result = jc.parsers.ls_s.parse(ls_command_output) # result is an iterable object
for item in result:
# do something
Schema:
{
"filename": string,
"flags": string,
"links": integer,
"parent": string, # not yet implemented
"owner": string,
"group": string,
"size": integer,
"date": string,
"epoch": integer, # naive timestamp if date field exists and can be converted
"epoch_utc": integer, # timezone aware timestamp if date field is in UTC and can be converted
"_meta":
{
"success": boolean, # true if successfully parsed, false if error
"error": string, # exists if "success" is false
"line": string # exists if "success" is false
}
}
Examples:
$ ls -l /usr/bin | jc --ls-s
{"filename":"2to3-","flags":"-rwxr-xr-x","links":4,"owner":"root","group":"wheel","size":925,"date":"Feb 22 2019","_meta":{"success":true}}
{"filename":"2to3-2.7","link_to":"../../System/Library/Frameworks/Python.framework/Versions/2.7/bin/2to3-2.7","flags":"lrwxr-xr-x","links":1,"owner":"root","group":"wheel","size":74,"date":"May 4 2019","_meta":{"success":true}}
{"filename":"AssetCacheLocatorUtil","flags":"-rwxr-xr-x","links":1,"owner":"root","group":"wheel","size":55152,"date":"May 3 2019","_meta":{"success":true}}
...
$ ls -l /usr/bin | jc --ls-s -r
{"filename":"2to3-","flags":"-rwxr-xr-x","links":"4","owner":"root","group":"wheel","size":"925","date":"Feb 22 2019","_meta":{"success":true}}
{"filename":"2to3-2.7","link_to":"../../System/Library/Frameworks/Python.framework/Versions/2.7/bin/2to3-2.7","flags":"lrwxr-xr-x","links":"1","owner":"root","group":"wheel","size":"74","date":"May 4 2019","_meta":{"success":true}}
{"filename":"AssetCacheLocatorUtil","flags":"-rwxr-xr-x","links":"1","owner":"root","group":"wheel","size":"55152","date":"May 3 2019","_meta":{"success":true}}
...
"""
import re
import jc.utils
from jc.exceptions import ParseError
class info:
    """Provides parser metadata (version, author, etc.)"""
    version = '1.0'
    description = '`ls` command streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'

    # compatible options: linux, darwin, cygwin, win32, aix, freebsd
    compatible = ['linux', 'darwin', 'cygwin', 'aix', 'freebsd']

    # flag set for this parser; the description above calls it a streaming parser
    streaming = True


# module-level version string mirrors the parser metadata
__version__ = info.version
def _process(proc_data):
    """
    Final processing of a single entry to conform to the schema.

    Parameters:

        proc_data:   (Dictionary) raw structured data for one entry

    Returns:

        Dictionary. The same object that was passed in, modified in place:
        the `links` and `size` values are passed through
        `jc.utils.convert_to_int`, and `epoch` / `epoch_utc` are added when
        a `date` value is present that is not in the default `ls` format.
    """
    numeric_fields = ('links', 'size')

    # snapshot the matching keys first, then convert each value in place
    for field in [name for name in proc_data if name in numeric_fields]:
        proc_data[field] = jc.utils.convert_to_int(proc_data[field])

    # to speed up processing only try to convert the date if it's not the
    # default format (e.g. "Feb 22 2019" or "May  4 12:30")
    default_date = r'[a-zA-Z]{3}\s{1,2}\d{1,2}\s{1,2}[0-9:]{4,5}'
    if 'date' not in proc_data or re.match(default_date, proc_data['date']):
        return proc_data

    stamp = jc.utils.timestamp(proc_data['date'])
    proc_data['epoch'] = stamp.naive
    proc_data['epoch_utc'] = stamp.utc
    return proc_data
def parse(data, raw=False, quiet=False):
    """
    Main text parsing generator function. Yields one dictionary per entry.

    Parameters:

        data:        (iterable)  line-based text data to parse, one line per
                                 item; lines may or may not carry a trailing
                                 newline
        raw:         (boolean)   output preprocessed JSON if True
        quiet:       (boolean)   suppress warning messages if True; also
                                 yields an error entry for an unparsable line
                                 instead of raising

    Yields:

        Dictionary. Raw or processed structured data for each `ls -l` entry.
        When `quiet` is True and a line cannot be parsed, a dictionary holding
        only `_meta` (`success` False, `error`, `line`) is yielded instead.

    Raises:

        ParseError (or whatever exception parsing a line triggered) when
        `quiet` is False.
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)

    # compiled once instead of being re-looked-up for every line
    flags_re = re.compile(r'[-dclpsbDCMnP?]([-r][-w][-xsS]){2}([-r][-w][-xtT])[+]?')
    total_re = re.compile(r'total [0-9]+')
    parent = ''

    for line in data:
        try:
            # lines read from a stream keep their line terminator; drop only
            # that so the header and blank-line checks below can match
            text = line.rstrip('\r\n')

            # blank lines separate directory sections and follow the header of
            # an empty directory on OSX (which prints no 'total xx' line)
            if not text.strip():
                continue

            # Look for parent line if glob or -R is used
            if not flags_re.match(text) and text.endswith(':'):
                parent = text[:-1]
                continue

            # skip the 'total 1234' line
            if total_re.match(text):
                continue

            # Only support -l option
            # no support for filenames with newline chars in streaming parser
            if not flags_re.match(text):
                raise ParseError(f'Unparsable line: {line[0:60]}')

            parsed_line = line.strip().split(maxsplit=8)
            output_line = {}

            # split filenames and links
            if len(parsed_line) == 9:
                filename_field = parsed_line[8].split(' -> ')
            else:
                # in case of filenames starting with a newline character
                filename_field = ['']

            output_line['filename'] = filename_field[0]

            if len(filename_field) > 1:
                output_line['link_to'] = filename_field[1]

            if parent:
                output_line['parent'] = parent

            output_line['flags'] = parsed_line[0]
            output_line['links'] = parsed_line[1]
            output_line['owner'] = parsed_line[2]
            output_line['group'] = parsed_line[3]
            output_line['size'] = parsed_line[4]
            output_line['date'] = ' '.join(parsed_line[5:8])
            output_line['_meta'] = {'success': True}

            if raw:
                yield output_line
            else:
                yield _process(output_line)

        except Exception:
            if not quiet:
                raise

            # best-effort mode: report the failing line and keep streaming
            yield {
                '_meta':
                    {
                        'success': False,
                        'error': 'error parsing line',
                        'line': line.strip()
                    }
            }