nerd-fonts/bin/scripts/name_parser/FontnameTools.py

#!/usr/bin/env python
# coding=utf8

import re
import sys

class FontnameTools:
    """Deconstruct a font filename to get standardized name parts"""

    @staticmethod
    def front_upper(word):
        """Capitalize a string (but keep case of subsequent chars)"""
        return word[:1].upper() + word[1:]

    @staticmethod
    def camel_casify(word):
        """Remove blanks and use CamelCase for the new word"""
        return ''.join(map(FontnameTools.front_upper, word.split(' ')))

    @staticmethod
    def camel_explode(word):
        """Explode CamelCase -> Camel Case"""
        # But do not explode "JetBrains" etc at string start...
        excludes = [
                'JetBrains',
                'DejaVu',
                'OpenDyslexicAlta',
                'OpenDyslexicMono',
                'OpenDyslexic',
                'DaddyTimeMono',
                'InconsolataGo',
                'ProFontWindows',
                'ProFont',
                'ProggyClean',
                ]
        m = re.match('(' + '|'.join(excludes) + ')(.*)', word)
        (prefix, word) = m.group(1,2) if m != None else ('', word)
        if len(word) == 0:
            return prefix
        parts = re.split('(?<=[a-z0-9])(?=[A-Z])', word)
        if len(prefix):
            parts.insert(0, prefix)
        return ' '.join(parts)

    @staticmethod
    def drop_empty(l):
        """Remove empty strings from list of strings"""
        return [x for x in l if len(x) > 0]

    @staticmethod
    def concat(*all_things):
        """Flatten list of (strings or lists of strings) to a blank-separated string"""
        all = []
        for thing in all_things:
            if type(thing) is not list:
                all.append(thing)
            else:
                all += thing
        return ' '.join(FontnameTools.drop_empty(all))

    @staticmethod
    def unify_style_names(style_name):
        """Substitude some known token with standard wording"""
        known_names = {
            # Source of the table is the current sourcefonts
            # Left side needs to be lower case
            '-':            '',
            'book':         '',
            'text':         '',
            'ce':           'CE',
            '(ttf)':        '(TTF)',
            #'semibold':     'Demi',
            'ob':           'Oblique',
            'it':           'Italic',
            'i':            'Italic',
            'b':            'Bold',
            'normal':       'Regular',
            'c':            'Condensed',
            'r':            'Regular',
            'm':            'Medium',
            'l':            'Light',
        }
        if style_name in known_names:
            return known_names[style_name.lower()]
        return style_name

    @staticmethod
    def shorten_style_name(name):
        """Substitude some known styles to short form"""
        known_names = {
            # Chiefly from Noto
            'SemiCondensed':    'SemCond',
            'Condensed':        'Cond',
            'ExtraCondensed':   'ExtCond',
            'SemiBold':         'SemBd',
            'ExtraBold':        'ExtBd',
            'Medium':           'Med',
            'ExtraLight':       'ExtLt',
            'Black':            'Blk',
        }
        if name in known_names:
            return known_names[name]
        return name

    @staticmethod
    def short_styles(styles):
        """Shorten all style names in a list"""
        return list(map(FontnameTools.shorten_style_name, styles))
    @staticmethod
    def make_oblique_style(weights, styles):
        """Move "Oblique" from weights to styles for font naming purposes"""
        if 'Oblique' in weights:
            weights = list(weights)
            weights.remove('Oblique')
            styles = list(styles)
            styles.append('Oblique')
        return (weights, styles)

    @staticmethod
    def get_name_token(name, tokens, allow_regex_token = False):
        """Try to find any case insensitive token from tokens in the name, return tuple with found token-list and rest"""
        # The default mode (allow_regex_token = False) will try to find any verbatim string in the
        # tokens list (case insensitive matching) and give that tokens list item back with
        # unchanged case (i.e. [ 'Bold' ] will match "bold" and return it as [ 'Bold', ]
        # In the regex mode (allow_regex_token = True) it will use the tokens elements as
        # regexes and return the original (i.e. from name) case.
        #
        # Token are always used in a regex and may not capture, use non capturing
        # grouping if needed (?: ... )
        lower_tokens = [ t.lower() for t in tokens ]
        not_matched = ""
        all_tokens = []
        j = 1
        regex = re.compile('(.*?)(' + '|'.join(tokens) + ')(.*)', re.IGNORECASE)
        while j:
            j = regex.match(name)
            if not j:
                break
            if len(j.groups()) != 3:
                sys.exit('Malformed regex in FontnameTools.get_name_token()')
            not_matched += ' ' + j.groups()[0] # Blanc prevents unwanted concatenation of unmatched substrings
            tok = j.groups()[1].lower()
            if tok in lower_tokens:
                tok = tokens[lower_tokens.index(tok)]
            tok = FontnameTools.unify_style_names(tok)
            if len(tok):
                all_tokens.append(tok)
            name = j.groups()[2] # Recurse rest
        not_matched += ' ' + name
        return ( not_matched.strip(), all_tokens )

    @staticmethod
    def postscript_char_filter(name):
        """Filter out characters that are not allowed in Postscript names"""
        # The name string must be restricted to the printable ASCII subset, codes 33 to 126,
        # except for the 10 characters '[', ']', '(', ')', '{', '}', '<', '>', '/', '%'
        out = ""
        for c in name:
            if c in '[](){}<>/%' or ord(c) < 33 or ord(c) > 126:
                continue
            out += c
        return out

    # List of ( pattern, replacement ) pairs: the left side is a regex, the right side
    # a replacement string that can refer to the groups of the regex (\1, \2, ...).
    # Most patterns capture the first letter of a word, so that the replacement
    # reuses that letter exactly as it has been found.
    # NOTE(review): presumably these are the renames needed for fonts with a reserved
    # name (SIL Open Font License) and applied via re.sub() by the user of this
    # table - that code is not in this file, please confirm.
    SIL_TABLE = [
        ( '(s)ource',                   r'\1auce' ),
        ( '(h)ermit',                   r'\1urmit' ),
        ( '(h)asklig',                  r'\1asklug' ),
        ( '(s)hare',                    r'\1hure' ),
        ( 'IBM[- ]?plex',               r'Blex' ), # We do not keep the case here
        ( '(t)erminus',                 r'\1erminess' ),
        ( '(l)iberation',               r'\1iteration' ),
        ( 'iA([- ]?)writer',            r'iM\1Writing' ),
        ( '(a)nka/(c)oder',             r'\1na\2onder' ),
        ( '(c)ascadia( ?)(c)ode',       r'\1askaydia\2\3ove' ),
        ( '(c)ascadia( ?)(m)ono',       r'\1askaydia\2\3ono' ),
        ( '(m)plus',                    r'\1+'), # Added this, because they use a plus symbol :->
        ( 'Gohufont',                   r'GohuFont'), # Correct to CamelCase
        # No one cares that font names starting with a digit are forbidden:
        # ( '(3270)',                     r'Ibeam\1'),
    ]

    @staticmethod
    def is_keep_regular(basename):
        """This has been decided by the font designers, we need to mimic that (for comparison purposes)"""
        KEEP_REGULAR = [
            'Agave',
            'Arimo',
            'Aurulent',
            'Cascadia',
            'Cousine',
            'Fantasque',
            'Fira',

            'Overpass',
            'Lilex',
            'Inconsolata$', # not InconsolataGo
            'IAWriter',
            'Meslo',
            'Monoid',
            'Mononoki',
            'Hack',
            'JetBrains Mono',
            'Noto Sans',
            'Noto Serif',
            'Victor',
        ]
        for kr in KEEP_REGULAR:
            if (basename.rstrip() + '$').startswith(kr): return True
        return False

    @staticmethod
    def _parse_simple_font_name(name):
        """Parse a filename that does not follow the 'FontFamilyName-FontStyle' pattern"""
        # No dash in name, maybe we have blanc separated filename?
        if ' ' in name:
            return FontnameTools.parse_font_name(name.replace(' ', '-'))
        # Do we have a number-name boundary?
        p = re.split('(?<=[0-9])(?=[a-zA-Z])', name)
        if len(p) > 1:
            return FontnameTools.parse_font_name('-'.join(p))
        # Or do we have CamelCase?
        n = FontnameTools.camel_explode(name)
        if n != name:
            return FontnameTools.parse_font_name(n.replace(' ', '-'))
        return (False, FontnameTools.camel_casify(name), [], [], [], '')

    @staticmethod
    def parse_font_name(name):
        """Expects a filename following the 'FontFamilyName-FontStyle' pattern and returns ... parts"""
        name = re.sub(r'\bsemi-narrow\b', 'SemiNarrow', name, 1, re.IGNORECASE) # Just for "3270 Semi-Narrow" :-/
        name = re.sub('[_\s]+', ' ', name)
        matches = re.match(r'([^-]+)(?:-(.*))?', name)
        familyname = FontnameTools.camel_casify(matches.group(1))
        style = matches.group(2)

        if not style:
            return FontnameTools._parse_simple_font_name(name)

        # These are the FontStyle keywords we know, in three categories
        # Weights end up as Typographic Family parts ('after the dash')
        # Styles end up as Family parts (for classic grouping of four)
        # Others also end up in Typographic Family ('before the dash')
        weights = [ 'Thin', 'Light', 'ExtraLight', 'SemiBold', 'Demi',
                    'SemiLight', 'Medium', 'Black', 'ExtraBold', 'Heavy',
                    'Oblique', 'Condensed', 'SemiCondensed', 'ExtraCondensed',
                    'Narrow', 'SemiNarrow', 'Retina', ]
        styles = [ 'Bold', 'Italic', 'Regular', 'Normal', ]
        # Some font specialities:
        other = [
            '-', 'Book', 'For', 'Powerline',
            'Text',             # Plex
            'IIx',              # Profont IIx
            'LGC',              # Inconsolata LGC
            r'\(TTF\)',         # Terminus (TTF)
            r'\bCE\b',          # ProggycleanTT CE
            r'[12][cmp]n?',     # MPlus
            r'(?:uni-)?1[14]',  # GohuFont uni
        ]

        # Sometimes used abbreviations
        weight_abbrevs = [ 'ob', 'c', 'm', 'l', ]
        style_abbrevs = [ 'it', 'r', 'b', 'i', ]

        ( style, weight_token ) = FontnameTools.get_name_token(style, weights)
        ( style, style_token ) = FontnameTools.get_name_token(style, styles)
        ( style, other_token ) = FontnameTools.get_name_token(style, other, True)
        if len(style) < 4:
            ( style, weight_token_abbrevs ) = FontnameTools.get_name_token(style, weight_abbrevs)
            ( style, style_token_abbrevs ) = FontnameTools.get_name_token(style, style_abbrevs)
            weight_token += weight_token_abbrevs
            style_token += style_token_abbrevs
        while 'Regular' in style_token and len(style_token) > 1:
            # Correct situation where "Regular" and something else is given
            style_token.remove('Regular')

        # Recurse to see if unmatched stuff between dashes can belong to familyname
        matches2 = re.match(r'(\w+)-(.*)', style)
        if matches2:
            return FontnameTools.parse_font_name(familyname + matches2.group(1) + '-' + matches2.group(2))

        style = re.sub(r'(^|\s)\d+(\.\d+)+(\s|$)', r'\1\3', style) # Remove (free standing) version numbers
        style_parts = FontnameTools.drop_empty(style.split(' '))
        style = ' '.join(map(FontnameTools.front_upper, style_parts))
        familyname = FontnameTools.camel_explode(familyname)
        return (True, familyname, weight_token, style_token, other_token, style)
Draft: Introduce a file name parser DO NOT MERGE [why] A lot of the fonts have incorrect naming after patching. A completely different approach can help to come up with a consistent naming scheme. [how] See bin/scripts/name-parser/README.md Signed-off-by: Fini Jastrow <ulf.fini.jastrow@desy.de> 2021-12-02 23:29:54 +02:00			`#!/usr/bin/env python`
			`# coding=utf8`

			`import re`
			`import sys`

			`class FontnameTools:`
			`"""Deconstruct a font filename to get standardized name parts"""`

			`@staticmethod`
			`def front_upper(word):`
			`"""Capitalize a string (but keep case of subsequent chars)"""`
			`return word[:1].upper() + word[1:]`

			`@staticmethod`
			`def camel_casify(word):`
			`"""Remove blanks and use CamelCase for the new word"""`
			`return ''.join(map(FontnameTools.front_upper, word.split(' ')))`

			`@staticmethod`
			`def camel_explode(word):`
			`"""Explode CamelCase -> Camel Case"""`
			`# But do not explode "JetBrains" etc at string start...`
			`excludes = [`
			`'JetBrains',`
			`'DejaVu',`
			`'OpenDyslexicAlta',`
			`'OpenDyslexicMono',`
			`'OpenDyslexic',`
			`'DaddyTimeMono',`
			`'InconsolataGo',`
			`'ProFontWindows',`
			`'ProFont',`
			`'ProggyClean',`
			`]`
			`m = re.match('(' + '|'.join(excludes) + ')(.*)', word)`
			`(prefix, word) = m.group(1,2) if m != None else ('', word)`
			`if len(word) == 0:`
			`return prefix`
			`parts = re.split('(?<=[a-z0-9])(?=[A-Z])', word)`
			`if len(prefix):`
			`parts.insert(0, prefix)`
			`return ' '.join(parts)`

			`@staticmethod`
			`def drop_empty(l):`
			`"""Remove empty strings from list of strings"""`
			`return [x for x in l if len(x) > 0]`

			`@staticmethod`
			`def concat(*all_things):`
			`"""Flatten list of (strings or lists of strings) to a blank-separated string"""`
			`all = []`
			`for thing in all_things:`
name-parser: Fix Python2 compatibility [why] The naming has bizarre blanks strewn in sometimes, or is all caps. For example `C a s k a y d i a C o v e` or `CASKAYDIACOVE-Regular` [how] When run under Python2 all strings are unicode strings because `unicode_literals` is imported by `font-patcher`. Unfortunately the code checks for type str; but that will all become type unicode with the import. One check is suboptimal anyhow and can be dropped, while the other is turned around. Signed-off-by: Fini Jastrow <ulf.fini.jastrow@desy.de> 2022-02-06 21:58:01 +02:00			`if type(thing) is not list:`
Draft: Introduce a file name parser DO NOT MERGE [why] A lot of the fonts have incorrect naming after patching. A completely different approach can help to come up with a consistent naming scheme. [how] See bin/scripts/name-parser/README.md Signed-off-by: Fini Jastrow <ulf.fini.jastrow@desy.de> 2021-12-02 23:29:54 +02:00			`all.append(thing)`
			`else:`
			`all += thing`
			`return ' '.join(FontnameTools.drop_empty(all))`

			`@staticmethod`
			`def unify_style_names(style_name):`
			`"""Substitude some known token with standard wording"""`
			`known_names = {`
			`# Source of the table is the current sourcefonts`
			`# Left side needs to be lower case`
			`'-': '',`
			`'book': '',`
			`'text': '',`
			`'ce': 'CE',`
			`'(ttf)': '(TTF)',`
			`#'semibold': 'Demi',`
			`'ob': 'Oblique',`
			`'it': 'Italic',`
			`'i': 'Italic',`
			`'b': 'Bold',`
			`'normal': 'Regular',`
			`'c': 'Condensed',`
			`'r': 'Regular',`
			`'m': 'Medium',`
			`'l': 'Light',`
			`}`
			`if style_name in known_names:`
			`return known_names[style_name.lower()]`
			`return style_name`

			`@staticmethod`
			`def shorten_style_name(name):`
			`"""Substitude some known styles to short form"""`
			`known_names = {`
			`# Chiefly from Noto`
			`'SemiCondensed': 'SemCond',`
			`'Condensed': 'Cond',`
			`'ExtraCondensed': 'ExtCond',`
			`'SemiBold': 'SemBd',`
			`'ExtraBold': 'ExtBd',`
			`'Medium': 'Med',`
			`'ExtraLight': 'ExtLt',`
			`'Black': 'Blk',`
			`}`
			`if name in known_names:`
			`return known_names[name]`
			`return name`

			`@staticmethod`
			`def short_styles(styles):`
			`"""Shorten all style names in a list"""`
			`return list(map(FontnameTools.shorten_style_name, styles))`
			`@staticmethod`
			`def make_oblique_style(weights, styles):`
			`"""Move "Oblique" from weights to styles for font naming purposes"""`
			`if 'Oblique' in weights:`
			`weights = list(weights)`
			`weights.remove('Oblique')`
			`styles = list(styles)`
			`styles.append('Oblique')`
			`return (weights, styles)`

			`@staticmethod`
			`def get_name_token(name, tokens, allow_regex_token = False):`
			`"""Try to find any case insensitive token from tokens in the name, return tuple with found token-list and rest"""`
			`# The default mode (allow_regex_token = False) will try to find any verbatim string in the`
			`# tokens list (case insensitive matching) and give that tokens list item back with`
			`# unchanged case (i.e. [ 'Bold' ] will match "bold" and return it as [ 'Bold', ]`
			`# In the regex mode (allow_regex_token = True) it will use the tokens elements as`
			`# regexes and return the original (i.e. from name) case.`
			`#`
			`# Token are always used in a regex and may not capture, use non capturing`
			`# grouping if needed (?: ... )`
			`lower_tokens = [ t.lower() for t in tokens ]`
			`not_matched = ""`
			`all_tokens = []`
			`j = 1`
			`regex = re.compile('(.*?)(' + '|'.join(tokens) + ')(.*)', re.IGNORECASE)`
			`while j:`
			`j = regex.match(name)`
			`if not j:`
			`break`
			`if len(j.groups()) != 3:`
			`sys.exit('Malformed regex in FontnameTools.get_name_token()')`
			`not_matched += ' ' + j.groups()[0] # Blanc prevents unwanted concatenation of unmatched substrings`
			`tok = j.groups()[1].lower()`
			`if tok in lower_tokens:`
			`tok = tokens[lower_tokens.index(tok)]`
			`tok = FontnameTools.unify_style_names(tok)`
			`if len(tok):`
			`all_tokens.append(tok)`
			`name = j.groups()[2] # Recurse rest`
			`not_matched += ' ' + name`
			`return ( not_matched.strip(), all_tokens )`

			`@staticmethod`
			`def postscript_char_filter(name):`
			`"""Filter out characters that are not allowed in Postscript names"""`
			`# The name string must be restricted to the printable ASCII subset, codes 33 to 126,`
			`# except for the 10 characters '[', ']', '(', ')', '{', '}', '<', '>', '/', '%'`
			`out = ""`
			`for c in name:`
			`if c in '[](){}<>/%' or ord(c) < 33 or ord(c) > 126:`
			`continue`
			`out += c`
			`return out`

			`SIL_TABLE = [`
			`( '(s)ource', r'\1auce' ),`
			`( '(h)ermit', r'\1urmit' ),`
			`( '(h)asklig', r'\1asklug' ),`
			`( '(s)hare', r'\1hure' ),`
			`( 'IBM[- ]?plex', r'Blex' ), # We do not keep the case here`
			`( '(t)erminus', r'\1erminess' ),`
			`( '(l)iberation', r'\1iteration' ),`
			`( 'iA([- ]?)writer', r'iM\1Writing' ),`
			`( '(a)nka/(c)oder', r'\1na\2onder' ),`
			`( '(c)ascadia( ?)(c)ode', r'\1askaydia\2\3ove' ),`
			`( '(c)ascadia( ?)(m)ono', r'\1askaydia\2\3ono' ),`
			`( '(m)plus', r'\1+'), # Added this, because they use a plus symbol :->`
			`( 'Gohufont', r'GohuFont'), # Correct to CamelCase`
			`# Noone cares that font names starting with a digit are forbidden:`
			`# ( '(3270)', r'Ibeam\1'),`
			`]`

			`@staticmethod`
			`def is_keep_regular(basename):`
			`"""This has been decided by the font designers, we need to mimic that (for comparison purposes)"""`
			`KEEP_REGULAR = [`
			`'Agave',`
			`'Arimo',`
			`'Aurulent',`
			`'Cascadia',`
			`'Cousine',`
			`'Fantasque',`
			`'Fira',`

			`'Overpass',`
			`'Lilex',`
			`'Inconsolata$', # not InconsolataGo`
			`'IAWriter',`
			`'Meslo',`
			`'Monoid',`
			`'Mononoki',`
			`'Hack',`
			`'JetBrains Mono',`
			`'Noto Sans',`
			`'Noto Serif',`
			`'Victor',`
			`]`
			`for kr in KEEP_REGULAR:`
			`if (basename.rstrip() + '$').startswith(kr): return True`
			`return False`

			`@staticmethod`
			`def _parse_simple_font_name(name):`
			`"""Parse a filename that does not follow the 'FontFamilyName-FontStyle' pattern"""`
			`# No dash in name, maybe we have blanc separated filename?`
			`if ' ' in name:`
			`return FontnameTools.parse_font_name(name.replace(' ', '-'))`
			`# Do we have a number-name boundary?`
			`p = re.split('(?<=[0-9])(?=[a-zA-Z])', name)`
			`if len(p) > 1:`
			`return FontnameTools.parse_font_name('-'.join(p))`
			`# Or do we have CamelCase?`
			`n = FontnameTools.camel_explode(name)`
			`if n != name:`
			`return FontnameTools.parse_font_name(n.replace(' ', '-'))`
			`return (False, FontnameTools.camel_casify(name), [], [], [], '')`

			`@staticmethod`
			`def parse_font_name(name):`
			`"""Expects a filename following the 'FontFamilyName-FontStyle' pattern and returns ... parts"""`
			`name = re.sub(r'\bsemi-narrow\b', 'SemiNarrow', name, 1, re.IGNORECASE) # Just for "3270 Semi-Narrow" :-/`
			`name = re.sub('[_\s]+', ' ', name)`
			`matches = re.match(r'([^-]+)(?:-(.*))?', name)`
			`familyname = FontnameTools.camel_casify(matches.group(1))`
			`style = matches.group(2)`

			`if not style:`
			`return FontnameTools._parse_simple_font_name(name)`

			`# These are the FontStyle keywords we know, in three categories`
			`# Weights end up as Typographic Family parts ('after the dash')`
			`# Styles end up as Family parts (for classic grouping of four)`
			`# Others also end up in Typographic Family ('before the dash')`
			`weights = [ 'Thin', 'Light', 'ExtraLight', 'SemiBold', 'Demi',`
			`'SemiLight', 'Medium', 'Black', 'ExtraBold', 'Heavy',`
			`'Oblique', 'Condensed', 'SemiCondensed', 'ExtraCondensed',`
			`'Narrow', 'SemiNarrow', 'Retina', ]`
			`styles = [ 'Bold', 'Italic', 'Regular', 'Normal', ]`
			`# Some font specialities:`
			`other = [`
			`'-', 'Book', 'For', 'Powerline',`
			`'Text', # Plex`
			`'IIx', # Profont IIx`
			`'LGC', # Inconsolata LGC`
			`r'\(TTF\)', # Terminus (TTF)`
			`r'\bCE\b', # ProggycleanTT CE`
			`r'[12][cmp]n?', # MPlus`
			`r'(?:uni-)?1[14]', # GohuFont uni`
			`]`

			`# Sometimes used abbreviations`
			`weight_abbrevs = [ 'ob', 'c', 'm', 'l', ]`
			`style_abbrevs = [ 'it', 'r', 'b', 'i', ]`

			`( style, weight_token ) = FontnameTools.get_name_token(style, weights)`
			`( style, style_token ) = FontnameTools.get_name_token(style, styles)`
			`( style, other_token ) = FontnameTools.get_name_token(style, other, True)`
			`if len(style) < 4:`
			`( style, weight_token_abbrevs ) = FontnameTools.get_name_token(style, weight_abbrevs)`
			`( style, style_token_abbrevs ) = FontnameTools.get_name_token(style, style_abbrevs)`
			`weight_token += weight_token_abbrevs`
			`style_token += style_token_abbrevs`
			`while 'Regular' in style_token and len(style_token) > 1:`
			`# Correct situation where "Regular" and something else is given`
			`style_token.remove('Regular')`

			`# Recurse to see if unmatched stuff between dashes can belong to familyname`
			`matches2 = re.match(r'(\w+)-(.*)', style)`
			`if matches2:`
			`return FontnameTools.parse_font_name(familyname + matches2.group(1) + '-' + matches2.group(2))`

			`style = re.sub(r'(^|\s)\d+(\.\d+)+(\s|$)', r'\1\3', style) # Remove (free standing) version numbers`
			`style_parts = FontnameTools.drop_empty(style.split(' '))`
			`style = ' '.join(map(FontnameTools.front_upper, style_parts))`
			`familyname = FontnameTools.camel_explode(familyname)`
			`return (True, familyname, weight_token, style_token, other_token, style)`