From 8c3cff59c60a15e434ee8d37c72da4c4d7b1ffa3 Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Sat, 16 Jul 2022 11:27:58 -0700 Subject: [PATCH] make parser more robust for non-standard extension fields --- jc/parsers/m3u.py | 60 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/jc/parsers/m3u.py b/jc/parsers/m3u.py index 22c6558c..c5542e0c 100644 --- a/jc/parsers/m3u.py +++ b/jc/parsers/m3u.py @@ -1,6 +1,12 @@ """jc - JSON Convert M3U and M3U8 file parser -Only standard extended info fields are supported. +This parser will make a best-effort to parse extended field information. If +the extended fields cannot be successfully parsed, then an `unparsed_info` +field will be added to the object. If not using `--quiet`, then a warning +message also will be printed to `STDERR`. + +Parsing issues with extended field information will usually occur with lines +that include punctuation like single quotes. Usage (cli): @@ -17,10 +23,15 @@ Schema: { "runtime": integer, "display": string, - "path": string + "path": string, + : string, # [0] + "unparsed_info": string, # [1] } ] + [0] Field names are pulled directly from the #EXTINF: line + [1] Only added if the extended information cannot be parsed + Examples: $ cat playlist.m3u | jc --m3u -p @@ -51,7 +62,9 @@ Examples: } ] """ +import shlex from typing import List, Dict +from typing_extensions import runtime import jc.utils @@ -114,18 +127,48 @@ def parse( output_line = {} if jc.utils.has_data(data): - for line in filter(None, data.splitlines()): # ignore any lines with only whitespace if not jc.utils.has_data(line): continue - # standard extended info fields + # extended info fields if line.lstrip().startswith('#EXTINF:'): - output_line = { - 'runtime': line.split(':')[1].split(',')[0].strip(), - 'display': line.split(':')[1].split(',')[1].strip() - } + splitline = line.strip().split(':', maxsplit=1) + + # best-effort to parse additional extended fields + # if a parsing error occurs, a warning message will be + # printed to STDERR and `unparsed_info` added + try: + extline = shlex.shlex(splitline[1], posix=True) + extline.whitespace_split = True + extline.whitespace = ', \n' + extline_list = list(extline) + runtime = extline_list.pop(0) + display_list = [] + + for item in extline_list: + if '=' in item: + k, v = item.split('=', maxsplit=1) + output_line.update({k: v}) + + else: + display_list.append(item) + + display = ' '.join(display_list) + output_line.update({ + 'runtime': runtime, + 'display': display + }) + + except Exception: + if not quiet: + jc.utils.warning_message([ + 'Not able to parse non-standard extensions in the following line:', + line + ]) + output_line = {'unparsed_info': line} + continue # ignore all other extension info (obsolete) @@ -136,6 +179,7 @@ def parse( output_line.update( {'path': line.strip()} ) + raw_output.append(output_line) output_line = {}