nerd-fonts/bin/scripts/name_parser/FontnameTools.py

#!/usr/bin/env python
# coding=utf8

import re
import sys

class FontnameTools:
    """Deconstruct a fontname to get standardized name parts"""

    @staticmethod
    def front_upper(word):
        """Capitalize a string (but keep case of subsequent chars)"""
        return word[:1].upper() + word[1:]

    @staticmethod
    def camel_casify(word):
        """Remove blanks and use CamelCase for the new word"""
        return ''.join(map(FontnameTools.front_upper, word.split(' ')))

    @staticmethod
    def camel_explode(word):
        """Explode CamelCase -> Camel Case"""
        # But do not explode "JetBrains" etc at string start...
        excludes = [
                'JetBrains',
                'DejaVu',
                'OpenDyslexicAlta',
                'OpenDyslexicMono',
                'OpenDyslexic',
                'DaddyTimeMono',
                'InconsolataGo',
                'ProFontWindows',
                'ProFont',
                'ProggyClean',
                ]
        m = re.match('(' + '|'.join(excludes) + ')(.*)', word)
        (prefix, word) = m.group(1,2) if m != None else ('', word)
        if len(word) == 0:
            return prefix
        parts = re.split('(?<=[a-z0-9])(?=[A-Z])', word)
        if len(prefix):
            parts.insert(0, prefix)
        return ' '.join(parts)

    @staticmethod
    def drop_empty(l):
        """Remove empty strings from list of strings"""
        return [x for x in l if len(x) > 0]

    @staticmethod
    def concat(*all_things):
        """Flatten list of (strings or lists of strings) to a blank-separated string"""
        all = []
        for thing in all_things:
            if type(thing) is not list:
                all.append(thing)
            else:
                all += thing
        return ' '.join(FontnameTools.drop_empty(all))

    @staticmethod
    def unify_style_names(style_name):
        """Substitude some known token with standard wording"""
        known_names = {
            # Source of the table is the current sourcefonts
            # Left side needs to be lower case
            'book':         '',
            'ce':           'CE',
            'normal':       'Regular',
        }
        return known_names.get(style_name.lower(), style_name)

    @staticmethod
    def find_in_dicts(key, dicts):
        """Find an entry in a list of dicts, return entry and in which list it was"""
        for i, d in enumerate(dicts):
            if key in d:
                return ( d[key], i )
        return (None, 0)

    @staticmethod
    def get_shorten_form_idx(aggressive, prefix, form_if_prefixed):
        """Get the tuple index of known_* data tables"""
        if aggressive:
            return 0
        if len(prefix):
            return form_if_prefixed
        return 1

    @staticmethod
    def shorten_style_name(name, aggressive):
        """Substitude some known styles to short form"""
        # If aggressive is False create the mild short form
        # aggressive == True: Always use first form of everything
        # aggressive == False:
        #               - has no modifier: use the second form
        #               - has modifier: use second form of mod plus first form of weights2
        #               - has modifier: use second form of mod plus second form of widths
        name_rest = name
        name_pre = ''
        form = FontnameTools.get_shorten_form_idx(aggressive, '', 0)
        for mod in FontnameTools.known_modifiers:
            if name.startswith(mod) and len(name) > len(mod): # Second condition specifically for 'Demi'
                name_pre = FontnameTools.known_modifiers[mod][form]
                name_rest = name[len(mod):]
                break
        subst, i = FontnameTools.find_in_dicts(name_rest, [ FontnameTools.known_weights2, FontnameTools.known_widths ])
        form = FontnameTools.get_shorten_form_idx(aggressive, name_pre, i)
        if isinstance(subst, tuple):
            return name_pre + subst[form]
        if not len(name_pre):
            # The following sets do not allow modifiers
            subst, _ = FontnameTools.find_in_dicts(name_rest, [ FontnameTools.known_weights1, FontnameTools.known_slopes ])
            if isinstance(subst, tuple):
                return subst[form]
        return name

    @staticmethod
    def short_styles(lists, aggressive):
        """Shorten all style names in a list or a list of lists"""
        if not len(lists) or not isinstance(lists[0], list):
            return list(map(lambda x: FontnameTools.shorten_style_name(x, aggressive), lists))
        return [ list(map(lambda x: FontnameTools.shorten_style_name(x, aggressive), styles)) for styles in lists ]

    @staticmethod
    def make_oblique_style(weights, styles):
        """Move "Oblique" from weights to styles for font naming purposes"""
        if 'Oblique' in weights:
            weights = list(weights)
            weights.remove('Oblique')
            styles = list(styles)
            styles.append('Oblique')
        return (weights, styles)

    @staticmethod
    def get_name_token(name, tokens):
        """Try to find any case insensitive token from tokens in the name, return tuple with found token-list and rest"""
        # The default mode (allow_regex_token = False) will try to find any verbatim string in the
        # tokens list (case insensitive matching) and give that tokens list item back with
        # unchanged case (i.e. [ 'Bold' ] will match "bold" and return it as [ 'Bold', ]
        # In the regex mode (allow_regex_token = True) it will use the tokens elements as
        # regexes and return the original (i.e. from name) case.
        #
        # Token are always used in a regex and may not capture, use non capturing
        # grouping if needed (?: ... )
        lower_tokens = [ t.lower() for t in tokens ]
        not_matched = ""
        all_tokens = []
        j = 1
        token_regex = '|'.join(tokens)
        # Allow a dash between CamelCase token word parts, i.e. Camel-Case
        # This allows for styles like Extra-Bold
        token_regex = re.sub(r'(?<=[a-z])(?=[A-Z])', '-?', token_regex)
        regex = re.compile('(.*?)(' + token_regex + ')(.*)', re.IGNORECASE)
        while j:
            j = regex.match(name)
            if not j:
                break
            if len(j.groups()) != 3:
                sys.exit('Malformed regex in FontnameTools.get_name_token()')
            not_matched += ' ' + j.groups()[0] # Blanc prevents unwanted concatenation of unmatched substrings
            tok = j.groups()[1].lower()
            tok = tok.replace('-', '') # Remove dashes between CamelCase token words
            if tok in lower_tokens:
                tok = tokens[lower_tokens.index(tok)]
            tok = FontnameTools.unify_style_names(tok)
            if len(tok):
                all_tokens.append(tok)
            name = j.groups()[2] # Recurse rest
        not_matched += ' ' + name
        return ( not_matched.strip(), all_tokens )

    @staticmethod
    def postscript_char_filter(name):
        """Filter out characters that are not allowed in Postscript names"""
        # The name string must be restricted to the printable ASCII subset, codes 33 to 126,
        # except for the 10 characters '[', ']', '(', ')', '{', '}', '<', '>', '/', '%'
        out = ''
        for c in name:
            if c in '[](){}<>/%' or ord(c) < 33 or ord(c) > 126:
                continue
            out += c
        return out

    SIL_TABLE = [
        ( '(a)nka/(c)oder',             r'\1na\2onder' ),
        ( '(a)nonymous',                r'\1nonymice' ),
        ( '(b)itstream( ?)(v)era( ?sans ?mono)?', r'\1itstrom\2Wera' ),
        ( '(c)ascadia( ?)(c)ode',       r'\1askaydia\2\3ove' ),
        ( '(c)ascadia( ?)(m)ono',       r'\1askaydia\2\3ono' ),
        ( 'Gohufont',                   r'GohuFont'), # Correct to CamelCase
        ( '(h)ermit',                   r'\1urmit' ),
        ( '(h)asklig',                  r'\1asklug' ),
        ( 'iA([- ]?)writer',            r'iM\1Writing' ),
        ( 'IBM[- ]?plex',               r'Blex' ), # We do not keep the case here
        ( '(i)ntel( ?)(o)ne',           r'\1ntone' ),
        ( '(l)iberation',               r'\1iteration' ),
        ( '(m)( ?)plus',                r'\1+'), # Added this, because they use a plus symbol :->
        ( '(s)hare',                    r'\1hure' ),
        ( '(s)ource',                   r'\1auce' ),
        ( '(t)erminus',                 r'\1erminess' ),
        # Noone cares that font names starting with a digit are forbidden:
        ( 'IBM 3270',                   r'3270'), # for historical reasons and 'IBM' is a TM or something
        # Some name parts that are too long for us
        ( '(.*sans ?m)ono',             r'\1'), # Various SomenameSansMono fonts
        ( '(.*code ?lat)in Expanded',   r'\1X'), # for 'M PLUS Code Latin Expanded'
        ( '(.*code ?lat)in',            r'\1'), # for 'M PLUS Code Latin'
        ( '(b)ig( ?)(b)lue( ?)(t)erminal', r'\1ig\3lue\5erm'), # Shorten BigBlueTerminal
        ( '(.*)437TT',                  r'\g<1>437'), # Shorten BigBlueTerminal 437 TT even further
        ( '(.*dyslexic ?alt)a',         r'\1'), # Open Dyslexic Alta -> Open Dyslexic Alt
        ( '(.*dyslexic ?m)ono',         r'\1'), # Open Dyslexic Mono -> Open Dyslexic M
        ( '(overpass ?m)ono',           r'\1'), # Overpass Mono -> Overpass M
        ( '(proggyclean) ?tt',          r'\1'), # Remove TT from ProggyClean
        ( '(terminess) ?\(ttf\)',       r'\1'), # Remove TTF from Terminus (after renamed to Terminess)
        ( '(im ?writing ?q)uattro',     r'\1uat'), # Rename iM Writing Quattro to Quat
        ( '(im ?writing ?(mono|duo|quat)) ?s', r'\1'), # Remove S from all iM Writing styles
    ]

    # From https://adobe-type-tools.github.io/font-tech-notes/pdfs/5088.FontNames.pdf
    # The first short variant is from the linked table.
    # The second (longer) short variant is from diverse fonts like Noto.
    # We can
    # - use the long form
    # - use the very short form (first)
    # - use mild short form:
    #   - has no modifier: use the second form
    #   - has modifier: use second form of mod plus first form of weights2
    #   - has modifier: use second form of mod plus second form of widths
    # This is encoded in get_shorten_form_idx()
    known_weights1 = { # can not take modifiers
        'Medium': ('Md', 'Med'),
        'Nord': ('Nd', 'Nord'),
        'Book': ('Bk', 'Book'),
        'Text': ('Txt', 'Text'),
        'Poster': ('Po', 'Poster'),
        'Demi': ('Dm', 'Demi'), # Demi is sometimes used as a weight, sometimes as a modifier
        'Regular': ('Rg', 'Reg'),
        'Display': ('DS', 'Disp'),
        'Super': ('Su', 'Sup'),
        'Retina': ('Rt', 'Ret'),
    }
    known_weights2 = { # can take modifiers
        'Black': ('Blk', 'Black'),
        'Bold': ('Bd', 'Bold'),
        'Heavy': ('Hv', 'Heavy'),
        'Thin': ('Th', 'Thin'),
        'Light': ('Lt', 'Light'),
        ' ': (), # Just for CodeClimate :-/
    }
    known_styles = [ # Keywords that end up as style (i.e. a RIBBI set)
        'Bold', 'Italic', 'Regular', 'Normal'
    ]
    known_widths = { # can take modifiers
        'Compressed': ('Cm', 'Comp'),
        'Extended': ('Ex', 'Extd'),
        'Condensed': ('Cn', 'Cond'),
        'Narrow': ('Nr', 'Narrow'),
        'Compact': ('Ct', 'Compact'),
    }
    known_slopes = { # can not take modifiers
        'Inclined': ('Ic', 'Incl'),
        'Oblique': ('Obl', 'Obl'),
        'Italic': ('It', 'Italic'),
        'Upright': ('Up', 'Uprght'),
        'Kursiv': ('Ks', 'Kurs'),
        'Sloped': ('Sl', 'Slop'),
    }
    known_modifiers = {
        'Demi': ('Dm', 'Dem'),
        'Ultra': ('Ult', 'Ult'),
        'Semi': ('Sm', 'Sem'),
        'Extra': ('X', 'Ext'),
    }
    equivalent_weights = {
        100: ('thin', 'hairline'),
        200: ('extralight', 'ultralight'),
        300: ('light', ),
        350: ('semilight', ),
        400: ('regular', 'normal', 'book', 'text', 'nord', 'retina'),
        500: ('medium', ),
        600: ('semibold', 'demibold', 'demi'),
        700: ('bold', ),
        800: ('extrabold', 'ultrabold'),
        900: ('black', 'heavy', 'poster', 'extrablack', 'ultrablack'),
    }

    @staticmethod
    def weight_string_to_number(w):
        """ Convert a common string approximation to a PS/2 weight value """
        if not isinstance(w, str) or len(w) < 1:
            return 400
        w = w.lower().replace('-', '').replace(' ', '')
        for num, strs in FontnameTools.equivalent_weights.items():
            if w in strs:
                return num
        return None

    @staticmethod
    def weight_to_string(w):
        """ Convert a PS/2 weight value to the common string approximation """
        if w < 150:
            str = 'Thin'
        elif w < 250:
            str = 'Extra-Light'
        elif w < 350:
            str = 'Light'
        elif w < 450:
            str = 'Regular'
        elif w < 550:
            str = 'Medium'
        elif w < 650:
            str = 'Semi-Bold'
        elif w < 750:
            str = 'Bold'
        elif w < 850:
            str = 'Extra-Bold'
        else:
            str = 'Black'
        return str

    @staticmethod
    def is_keep_regular(basename):
        """This has been decided by the font designers, we need to mimic that (for comparison purposes)"""
        KEEP_REGULAR = [
            'Agave',
            'Arimo',
            'Aurulent',
            'Cascadia',
            'Cousine',
            'Fantasque',
            'Fira',

            'Overpass',
            'Lilex',
            'Inconsolata$', # not InconsolataGo
            'IAWriter',
            'Meslo',
            'Monoid',
            'Mononoki',
            'Hack',
            'JetBrains Mono',
            'Noto Sans',
            'Noto Serif',
            'Victor',
        ]
        for kr in KEEP_REGULAR:
            if (basename.rstrip() + '$').startswith(kr): return True
        return False

    @staticmethod
    def _parse_simple_font_name(name):
        """Parse a fontname that does not follow the 'FontFamilyName-FontStyle' pattern"""
        # This is the usual case, because the font-patcher usually uses the fullname and
        # not the PS name
        if ' ' in name:
            return FontnameTools.parse_font_name(name.replace(' ', '-'))
        # Do we have a number-name boundary?
        p = re.split('(?<=[0-9])(?=[a-zA-Z])', name)
        if len(p) > 1:
            return FontnameTools.parse_font_name('-'.join(p))
        # Or do we have CamelCase?
        n = FontnameTools.camel_explode(name)
        if n != name:
            return FontnameTools.parse_font_name(n.replace(' ', '-'))
        return (False, FontnameTools.camel_casify(name), [], [], [], '')

    @staticmethod
    def parse_font_name(name):
        """Expects a fontname following the 'FontFamilyName-FontStyle' pattern and returns ... parts"""
        # This could parse filenames in the beginning but that was never used in production; code removed with this commit
        for special in [
                ('ExtLt', 'ExtraLight'), # IBM-Plex
                ('Medm', 'Medium'), # IBM-Plex
                ('Semi-Condensed', 'SemiCondensed'), # 3270
                ('SmBld', 'SemiBold'), # IBM-Plex
            ]:
            name = re.sub(r'\b' + special[0] + r'\b', special[1], name, 1, re.IGNORECASE)
        name = re.sub('[_\s]+', ' ', name)
        matches = re.match(r'([^-]+)(?:-(.*))?', name)
        familyname = FontnameTools.camel_casify(matches.group(1))
        style = matches.group(2)

        if not style:
            return FontnameTools._parse_simple_font_name(name)

        # These are the FontStyle keywords we know, in three categories
        # Weights end up as Typographic Family parts ('after the dash')
        # Styles end up as Family parts (for classic grouping of four)
        # Others also end up in Typographic Family ('before the dash')
        weights = [ m + s
                for s in list(FontnameTools.known_weights2) + list(FontnameTools.known_widths)
                for m in list(FontnameTools.known_modifiers) + [''] if m != s
            ] + list(FontnameTools.known_weights1) + list(FontnameTools.known_slopes)
        weights = [ w for w in weights if w not in FontnameTools.known_styles ]
        # Some font specialities:
        other = [
            '-', 'Book', 'For', 'Powerline',
            'IIx',              # Profont IIx
            'LGC',              # Inconsolata LGC
            r'\bCE\b',          # ProggycleanTT CE
            r'[12][cmp]n?',     # MPlus
            r'(?:uni-)?1[14]',  # GohuFont uni
        ]

        ( style, weight_token ) = FontnameTools.get_name_token(style, weights)
        ( style, style_token ) = FontnameTools.get_name_token(style, FontnameTools.known_styles)
        ( style, other_token ) = FontnameTools.get_name_token(style, other)
        while 'Regular' in style_token and len(style_token) > 1:
            # Correct situation where "Regular" and something else is given
            style_token.remove('Regular')

        # Recurse to see if unmatched stuff between dashes can belong to familyname
        matches2 = re.match(r'(\w+)-(.*)', style)
        if matches2:
            return FontnameTools.parse_font_name(familyname + matches2.group(1) + '-' + matches2.group(2))

        style = re.sub(r'(^|\s)\d+(\.\d+)+(\s|$)', r'\1\3', style) # Remove (free standing) version numbers
        style_parts = FontnameTools.drop_empty(style.split(' '))
        style = ' '.join(map(FontnameTools.front_upper, style_parts))
        familyname = FontnameTools.camel_explode(familyname)
        return (True, familyname, weight_token, style_token, other_token, style)