jc/jc/parsers/git_log.py

"""jc - JSON Convert `git log` command output parser

Can be used with the following format options:
- `oneline`
- `short`
- `medium`
- `full`
- `fuller`

Additional options supported:
- --stat
- --shortstat

Usage (cli):

    $ git log | jc --git-log

    or

    $ jc git log

Usage (module):

    import jc
    result = jc.parse('git_log', git_log_command_output)

Schema:

    [
      {
        "commit":               string,
        "merge":                string,
        "author":               string,
        "author_email":         string,
        "date":                 string,
        "commit_by":            string,
        "commit_by_email":      string,
        "commit_by_date":       string,
        "message":              string,
        "stats" : {
          "files_changed":      integer,
          "insertions":         integer,
          "deletions":          integer,
          "files": [
                                string
          ]
        }
      }
    ]

Examples:

    $ git log | jc --git-log -p
    []

    $ git log | jc --git-log -p -r
    []
"""
import re
from typing import List, Dict
import jc.utils


class info:
    """Parser metadata: version, description, author details and usage hints."""
    version = '1.0'
    description = '`git log` command parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # platform names handed to `jc.utils.compatibility()` by `parse()`
    compatible = [
        'linux',
        'darwin',
        'cygwin',
        'win32',
        'aix',
        'freebsd',
    ]
    magic_commands = ['git log']


# module-level version string, taken directly from the parser metadata
__version__ = info.version


def _process(proc_data: List[Dict]) -> List[Dict]:
    """
    Final processing to conform to the schema.

    For every entry that has a `stats` dictionary, the `files_changed`,
    `insertions` and `deletions` values are passed through
    `jc.utils.convert_to_int()`. The entries are modified in place and the
    same list object is returned.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Structured to conform to the schema.
    """
    numeric_fields = ('files_changed', 'insertions', 'deletions')

    for commit in proc_data:
        # entries without stats have nothing to convert
        if 'stats' not in commit:
            continue

        stats = commit['stats']
        for field in stats:
            if field in numeric_fields:
                stats[field] = jc.utils.convert_to_int(stats[field])

    return proc_data


def _is_commit_hash(hash_string: str) -> bool:
    """
    Return True if `hash_string` is a full-length commit hash.

    Only 40-character, lowercase hexadecimal strings are accepted, e.g.
    `0c55240e9da30ac4293dc324f1094de2abd3da91`.

    Parameters:

        hash_string: (string) candidate token to test

    Returns:

        Boolean. True only when all 40 characters are hex digits.
    """
    # fullmatch is required here: a plain prefix match would accept any
    # 40-character token that merely *starts* with a hex digit.
    return re.fullmatch(r'[0-9a-f]{40}', hash_string) is not None


def _split_name_email(value: str) -> tuple:
    """
    Split a `Name <email>` header value into a `(name, email)` tuple.

    The surrounding angle brackets are removed from the email. A part that
    is not present in `value` is returned as `None`.
    """
    parts = value.rsplit(maxsplit=1)

    if len(parts) == 2:
        return parts[0], parts[1].strip('<').strip('>')

    if not parts:
        return None, None

    # only one token is present: decide whether it is the email or the name
    if parts[0].startswith('<'):
        return None, parts[0].strip('<').strip('>')

    return parts[0], None


def _parse_stat_summary(line: str) -> Dict:
    """
    Parse a `--stat`/`--shortstat` summary line, for example
    ` 3 files changed, 10 insertions(+), 2 deletions(-)`.

    Returns a dictionary with the `files_changed`, `insertions` and
    `deletions` counters as strings ('0' when a counter is absent from the
    line), or an empty dictionary if `line` is not a summary line.
    """
    files = re.match(r'\s*(\d+)\s+files? changed,\s', line)
    if not files:
        return {}

    # insertions and deletions are looked up independently because either
    # one may be missing from the summary line
    insertions = re.search(r'(\d+)\s+insertions?\(\+\)', line)
    deletions = re.search(r'(\d+)\s+deletions?\(-\)', line)

    return {
        'files_changed': files.group(1),
        'insertions': insertions.group(1) if insertions else '0',
        'deletions': deletions.group(1) if deletions else '0'
    }


def _finish_commit(commit: Dict, message_lines: List[str], file_list: List[str]) -> Dict:
    """
    Attach the accumulated message lines and file names to `commit` and
    return it.
    """
    if message_lines:
        commit['message'] = '\n'.join(message_lines)

    if file_list:
        # file lines can show up without a summary line (e.g. truncated
        # input), so the 'stats' dictionary may not exist yet
        commit.setdefault('stats', {})['files'] = file_list

    return commit


def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[Dict]:
    """
    Main text parsing function

    Each commit is collected line by line and is only added to the output
    once the next commit starts (or the input ends), so that message lines
    and `--stat` information are attached to the commit they belong to.
    If an `Author:` or `Commit:` header lacks a name or an email, that
    field is set to `None`.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List = []
    output_line: Dict = {}
    message_lines: List[str] = []
    file_list: List[str] = []

    if jc.utils.has_data(data):

        for line in data.splitlines():
            line_list = line.split(maxsplit=1)

            # text after the first token; empty if the line has a single token
            value = line_list[1] if len(line_list) > 1 else ''

            # oneline style: the hash must sit at the very start of the line,
            # otherwise an indented message line that begins with a hash
            # would be mistaken for a new commit
            is_oneline = (
                bool(line_list)
                and line.startswith(line_list[0])
                and _is_commit_hash(line_list[0])
            )

            # start of a new commit (oneline style or any other style)
            if is_oneline or line.startswith('commit '):
                if output_line:
                    raw_output.append(
                        _finish_commit(output_line, message_lines, file_list)
                    )
                    message_lines = []
                    file_list = []

                if is_oneline:
                    output_line = {
                        'commit': line_list[0],
                        'message': value
                    }
                else:
                    output_line = {'commit': value}
                continue

            if line.startswith('Merge: '):
                output_line['merge'] = value
                continue

            if line.startswith('Author: '):
                name, email = _split_name_email(value)
                output_line['author'] = name
                output_line['author_email'] = email
                continue

            if line.startswith(('Date: ', 'AuthorDate: ')):
                output_line['date'] = value
                continue

            if line.startswith('CommitDate: '):
                output_line['commit_by_date'] = value
                continue

            if line.startswith('Commit: '):
                name, email = _split_name_email(value)
                output_line['commit_by'] = name
                output_line['commit_by_email'] = email
                continue

            # commit message lines are indented by four spaces
            if line.startswith('    '):
                message_lines.append(line.strip())
                continue

            # stat lines are indented by a single space
            if line.startswith(' '):
                summary = _parse_stat_summary(line)
                if summary:
                    # this is the stat summary
                    output_line['stats'] = summary
                else:
                    # this is a file name
                    file_list.append(line.split('|')[0].strip())

    # flush the last commit
    if output_line:
        raw_output.append(_finish_commit(output_line, message_lines, file_list))

    return raw_output if raw else _process(raw_output)