# chroma/_tools/pygments2chroma.py

import functools
import importlib
import json
import os
import re
import sys
import types

import pystache
from pygments import lexer as pygments_lexer
from pygments.token import _TokenType


# Mustache (pystache) template for the generated Go lexer source. main() renders
# it with a TemplateView, which supplies: package, upper_name, name, aliases,
# filenames, mimetypes, the re_not_multiline / re_dotall / re_ignorecase
# predicates, and tokens (a list of {'state': ..., 'rules': [...]} dicts).
#
# Inside the Config literal the tag delimiters are temporarily switched from
# "{{ }}" to "<% %>" (and back again afterwards) so that the literal Go braces
# in "[]string{...}" sit next to template tags without ambiguity.
# Each rule is emitted with triple braces ("{{{.}}}") so that it is inserted
# verbatim, without HTML escaping.
TEMPLATE = r'''
package {{package}}

import (
	. "github.com/alecthomas/chroma/v2" // nolint
	"github.com/alecthomas/chroma/v2/lexers/internal"
)

// {{upper_name}} lexer.
var {{upper_name}} = internal.Register(MustNewLazyLexer(
	&Config{
		Name:      "{{name}}",
		{{=<% %>=}}
		Aliases:   []string{<%#aliases%>"<%.%>", <%/aliases%>},
		Filenames: []string{<%#filenames%>"<%.%>", <%/filenames%>},
		MimeTypes: []string{<%#mimetypes%>"<%.%>", <%/mimetypes%>},
		<%={{ }}=%>
{{#re_not_multiline}}
		NotMultiline: true,
{{/re_not_multiline}}
{{#re_dotall}}
		DotAll: true,
{{/re_dotall}}
{{#re_ignorecase}}
		CaseInsensitive: true,
{{/re_ignorecase}}
	},
	func() Rules {
		return Rules{
{{#tokens}}
			"{{state}}": {
				{{#rules}}
				{{{.}}},
				{{/rules}}
			},
{{/tokens}}
		}
	},
))
'''


def go_regex(s):
    """Render the regex pattern *s* as a Go string literal.

    The pattern text itself is not rewritten; quoting is delegated entirely
    to ``go_string``.
    """
    literal = go_string(s)
    return literal


def go_string(s):
    """Quote *s* as a Go string literal.

    A raw (backtick-delimited) literal is preferred because it needs no
    escaping, which keeps regex patterns readable. When *s* cannot be
    represented faithfully as a raw literal, an interpreted (double-quoted)
    literal is produced instead.

    Args:
        s: The text to quote.

    Returns:
        Go source text for a string literal whose value is *s*.
    """
    # A raw literal cannot contain a backtick, and Go discards carriage
    # returns found inside raw literals, so either character forces the
    # interpreted form.
    if '`' not in s and '\r' not in s:
        return '`' + s + '`'
    # ensure_ascii=False keeps non-ASCII text as-is. The default would write
    # characters outside the BMP as a pair of \uXXXX surrogate escapes, which
    # Go rejects. The escapes JSON still emits (\" \\ \b \f \n \r \t \u00XX)
    # are all valid in Go interpreted string literals.
    return json.dumps(s, ensure_ascii=False)


def to_camel_case(snake_str):
    """Join the underscore-separated parts of *snake_str*, title-casing each.

    Every part goes through ``str.title``, so letters after the first in a
    run are lower-cased (``'VB_net'`` becomes ``'VbNet'``).
    """
    return ''.join(map(str.title, snake_str.split('_')))


def warning(message):
    """Print *message* to standard error, prefixed with ``warning: ``."""
    line = 'warning: ' + message
    print(line, file=sys.stderr)


def resolve_emitter(emitter):
    """Translate a Pygments rule action into Go source for a Chroma emitter.

    Supported inputs:

    * a ``bygroups(...)`` callback -> ``ByGroups(<emitters>)``, resolving each
      group's emitter recursively;
    * a ``using(...)`` callback -> ``UsingSelf("<state>")`` when its first
      closure cell holds a dict, or ``Using(<Name>)`` when it holds a lexer
      class (a trailing ``Lexer`` is stripped from the class name);
    * a token type -> its dotted name with the dots and the leading five
      characters (the ``Token`` prefix) removed;
    * ``None`` -> ``'None'``.

    Any other function is reported via ``warning`` and rendered as a
    ``?? ... ??`` placeholder so the rest of the lexer can still be generated.

    Args:
        emitter: The second element of a Pygments rule tuple.

    Returns:
        The Go expression as a string.

    Raises:
        ValueError: If the emitter is of an unsupported type, or a ``using``
            callback wraps something that is neither a dict nor a lexer class.
    """
    if isinstance(emitter, types.FunctionType):
        # The callbacks are recognised by their repr, and their arguments are
        # read from the first closure cell.
        # NOTE(review): this depends on how Pygments lays out those closures;
        # confirm against the installed Pygments version.
        description = repr(emitter)
        if description.startswith('<function bygroups.'):
            group_emitters = emitter.__closure__[0].cell_contents
            emitter = 'ByGroups(%s)' % ', '.join(resolve_emitter(e) for e in group_emitters)
        elif description.startswith('<function using.'):
            target = emitter.__closure__[0].cell_contents
            if isinstance(target, dict):
                # Work on a copy: the dict is the live object captured by the
                # callback, and popping from it would change the result of
                # resolving the same emitter a second time.
                options = dict(target)
                state = 'root'
                if 'stack' in options:
                    # Only the second stack entry is used as the target state.
                    state = options.pop('stack')[1]
                assert options == {}, options
                emitter = 'UsingSelf("%s")' % state
            elif isinstance(target, type) and issubclass(target, pygments_lexer.Lexer):
                name = target.__name__
                if name.endswith('Lexer'):
                    name = name[:-5]
                emitter = 'Using(%s)' % name
            else:
                # Wrap in a 1-tuple so a tuple value formats as one argument.
                raise ValueError('only support "using" with lexer classes, not %r' % (target,))
        else:
            warning('unsupported emitter function %r' % emitter)
            emitter = '?? %r ??' % emitter
    elif isinstance(emitter, _TokenType):
        emitter = str(emitter).replace('.', '')[5:]
    elif emitter is None:
        # This generally only occurs when a lookahead/behind assertion is used, so we just allow it
        # through.
        return 'None'
    else:
        raise ValueError('unsupported emitter type %r' % (emitter,))
    assert isinstance(emitter, str)
    return emitter


def process_state_action(action):
    """Translate a Pygments state action into Chroma mutator expressions.

    Args:
        action: Either a state-action string or a (possibly nested) tuple of
            them. Strings are interpreted as follows:

            * ``'#pop'``       -> ``Pop(1)``
            * ``'#pop:N'``     -> ``Pop(N)``
            * ``'#push'``      -> ``Push()``
            * ``'#push:name'`` -> ``Push("name")``
            * ``'name'``       -> ``Push("name")``

    Returns:
        A tuple of Go expression strings: one element for a string action,
        the flattened results for a tuple (empty for an empty tuple).

    Raises:
        ValueError: If a ``#``-prefixed action is not one of the forms above.
    """
    if isinstance(action, tuple):
        # Flatten recursively. Unlike a reduce() without an initial value,
        # this also copes with an empty tuple.
        return tuple(expr for part in action for expr in process_state_action(part))
    if not action.startswith('#'):
        return ('Push("%s")' % action,)
    directive = action[1:]
    if directive == 'pop':
        return ('Pop(1)',)
    if directive.startswith('pop:'):
        return ('Pop(%s)' % directive[4:],)
    if directive == 'push':
        return ('Push()',)
    if directive.startswith('push:'):
        return ('Push("%s")' % directive[5:],)
    raise ValueError('unsupported action %r' % (directive,))


def translate_rules(rules):
    """Convert the rules of one Pygments state into Go rule expressions.

    Tuple rules become ``{pattern, emitter, mutator}`` literals, ``include``
    entries become ``Include("...")`` and ``default`` entries become
    ``Default(...)``.

    Args:
        rules: Iterable of rule entries for a single state.

    Returns:
        A list of Go expression strings, in the same order as *rules*.

    Raises:
        ValueError: On an unsupported rule, pattern or state modifier.
    """
    translated = []
    for entry in rules:
        if not isinstance(entry, tuple):
            # Non-tuple entries are state-level directives.
            if isinstance(entry, pygments_lexer.include):
                translated.append('Include("{}")'.format(entry))
            elif isinstance(entry, pygments_lexer.default):
                mutators = process_state_action(entry.state)
                translated.append('Default({})'.format(', '.join(mutators)))
            else:
                raise ValueError('unsupported rule %r' % (entry,))
            continue

        # 1. The pattern: a plain regex string or a Pygments ``words`` helper.
        pattern = entry[0]
        if isinstance(pattern, str):
            pattern = go_regex(pattern)
        elif isinstance(pattern, pygments_lexer.words):
            prefix = go_string(pattern.prefix)
            suffix = go_string(pattern.suffix)
            word_list = ', '.join(go_string(w) for w in pattern.words)
            pattern = 'Words(%s, %s, %s)' % (prefix, suffix, word_list)
        else:
            raise ValueError('expected regex string but got %r' % pattern)

        # 2. The emitter.
        emitter = resolve_emitter(entry[1])

        # 3. The optional state modifier (third tuple element).
        if len(entry) == 2:
            mutator = 'nil'
        else:
            new_state = entry[2]
            # Exact type checks are deliberate: ``combined`` is matched by
            # its own branch rather than by the plain-tuple one.
            if type(new_state) is str:
                mutator = process_state_action(new_state)[0]
            elif isinstance(new_state, pygments_lexer.combined):
                mutator = 'Combined("%s")' % '", "'.join(new_state)
            elif type(new_state) is tuple:
                mutator = 'Push("%s")' % '", "'.join(new_state)
            else:
                raise ValueError('unsupported modifier %r' % (new_state,))

        translated.append('{{{}, {}, {}}}'.format(pattern, emitter, mutator))
    return translated


class TemplateView:
    """Rendering context passed to pystache together with TEMPLATE.

    Every keyword argument given to the constructor becomes an instance
    attribute of the same name. The ``re_*`` methods derive the template's
    regex-flag sections from the ``regex_flags`` attribute.
    """

    def __init__(self, **kwargs):
        vars(self).update(kwargs)

    def re_not_multiline(self):
        """Return True when ``re.MULTILINE`` is absent from ``regex_flags``."""
        multiline_bits = self.regex_flags & re.MULTILINE
        return not multiline_bits

    def re_dotall(self):
        """Return ``regex_flags`` masked with ``re.DOTALL`` (truthy if set)."""
        dotall_bits = self.regex_flags & re.DOTALL
        return dotall_bits

    def re_ignorecase(self):
        """Return ``regex_flags`` masked with ``re.IGNORECASE`` (truthy if set)."""
        ignorecase_bits = self.regex_flags & re.IGNORECASE
        return ignorecase_bits


def main():
    """Print the Go translation of the Pygments lexer named on the command line.

    ``sys.argv[1]`` must be a dotted path whose last component is the lexer
    class and whose prefix is the module to import. The class must be a
    ``RegexLexer`` subclass. The rendered TEMPLATE goes to standard output.
    """
    target = sys.argv[1]
    package_name, symbol_name = target.rsplit(sep=".", maxsplit=1)
    lexer_cls = getattr(importlib.import_module(package_name), symbol_name)

    assert issubclass(lexer_cls, pygments_lexer.RegexLexer), 'can only translate from RegexLexer'

    display_name = lexer_cls.name
    view = TemplateView(
        # The Go package is the first character of the lower-cased lexer name.
        package=display_name.lower()[0],
        name=display_name,
        regex_flags=lexer_cls.flags,
        # Non-word characters become underscores before camel-casing.
        upper_name=to_camel_case(re.sub(r'\W', '_', display_name)),
        aliases=lexer_cls.aliases,
        filenames=lexer_cls.filenames,
        mimetypes=lexer_cls.mimetypes,
        tokens=[
            {'state': state, 'rules': translate_rules(rules)}
            for state, rules in lexer_cls.get_tokendefs().items()
        ],
    )
    rendered = pystache.render(TEMPLATE, view)
    print(rendered)


if __name__ == '__main__':
    main()
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`import functools`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`import importlib`
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`import json`
			`import os`
			`import re`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`import sys`
			`import types`

			`import pystache`
			`from pygments import lexer as pygments_lexer`
			`from pygments.token import _TokenType`


			`TEMPLATE = r'''`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`package {{package}}`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00
			`import (`
Version 2 of Chroma This cleans up the API in general, removing a bunch of deprecated stuff, cleaning up circular imports, etc. But the biggest change is switching to an optional XML format for the regex lexer. Having lexers defined only in Go is not ideal for a couple of reasons. Firstly, it impedes a significant portion of contributors who use Chroma in Hugo, but don't know Go. Secondly, it bloats the binary size of any project that imports Chroma. Why XML? YAML is an abomination and JSON is not human editable. XML also compresses very well (eg. Go template lexer XML compresses from 3239 bytes to 718). Why a new syntax format? All major existing formats rely on the Oniguruma regex engine, which is extremely complex and for which there is no Go port. Why not earlier? Prior to the existence of fs.FS this was not a viable option. Benchmarks: $ hyperfine --warmup 3 \ './chroma.master --version' \ './chroma.xml-pre-opt --version' \ './chroma.xml --version' Benchmark 1: ./chroma.master --version Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms] Range (min … max): 4.2 ms … 6.6 ms 233 runs Benchmark 2: ./chroma.xml-pre-opt --version Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms] Range (min … max): 49.2 ms … 51.5 ms 51 runs Benchmark 3: ./chroma.xml --version Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms] Range (min … max): 5.7 ms … 19.9 ms 196 runs Summary './chroma.master --version' ran 1.30 ± 0.23 times faster than './chroma.xml --version' 9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version' A slight increase in init time, but I think this is okay given the increase in flexibility. 
And binary size difference: $ du -h lexers.test* $ du -sh chroma* 951371ms 8.8M chroma.master 7.8M chroma.xml 7.8M chroma.xml-pre-opt Benchmarks: $ hyperfine --warmup 3 \ './chroma.master --version' \ './chroma.xml-pre-opt --version' \ './chroma.xml --version' Benchmark 1: ./chroma.master --version Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms] Range (min … max): 4.2 ms … 6.6 ms 233 runs Benchmark 2: ./chroma.xml-pre-opt --version Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms] Range (min … max): 49.2 ms … 51.5 ms 51 runs Benchmark 3: ./chroma.xml --version Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms] Range (min … max): 5.7 ms … 19.9 ms 196 runs Summary './chroma.master --version' ran 1.30 ± 0.23 times faster than './chroma.xml --version' 9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version' Incompatible changes: - (RegexLexer).SetAnalyser: changed from func(func(text string) float32) RegexLexer to func(func(text string) float32) Lexer - (TokenType).UnmarshalJSON: removed - Lexer.AnalyseText: added - Lexer.SetAnalyser: added - Lexer.SetRegistry: added - MustNewLazyLexer: removed - MustNewLexer: changed from func(Config, Rules) RegexLexer to func(Config, func() Rules) RegexLexer - Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator - NewLazyLexer: removed - NewLexer: changed from func(Config, Rules) (RegexLexer, error) to func(Config, func() Rules) (*RegexLexer, error) - Pop: changed from func(int) MutatorFunc to func(int) Mutator - Push: changed from func(...string) MutatorFunc to func(...string) Mutator - TokenType.MarshalJSON: removed - Using: changed from func(Lexer) Emitter to func(string) Emitter - UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter 2022-01-03 23:51:17 +11:00			`. "github.com/alecthomas/chroma/v2" // nolint`
			`"github.com/alecthomas/chroma/v2/lexers/internal"`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`)`

			`// {{upper_name}} lexer.`
Update golangci-lint so we can force use of LazyLexer. 2021-04-29 12:07:50 +10:00			`var {{upper_name}} = internal.Register(MustNewLazyLexer(`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`&Config{`
			`Name: "{{name}}",`
			`{{=<% %>=}}`
			`Aliases: []string{<%#aliases%>"<%.%>", <%/aliases%>},`
			`Filenames: []string{<%#filenames%>"<%.%>", <%/filenames%>},`
			`MimeTypes: []string{<%#mimetypes%>"<%.%>", <%/mimetypes%>},`
			`<%={{ }}=%>`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`{{#re_not_multiline}}`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`NotMultiline: true,`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`{{/re_not_multiline}}`
			`{{#re_dotall}}`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`DotAll: true,`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`{{/re_dotall}}`
			`{{#re_ignorecase}}`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`CaseInsensitive: true,`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`{{/re_ignorecase}}`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`},`
			`func() Rules {`
			`return Rules{`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`{{#tokens}}`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`"{{state}}": {`
			`{{#rules}}`
			`{{{.}}},`
			`{{/rules}}`
			`},`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`{{/tokens}}`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`}`
			`},`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`))`
			`'''`


Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`def go_regex(s):`
			`return go_string(s)`


Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`def go_string(s):`
			if '`' not in s:
			return '`' + s + '`'
			`return json.dumps(s)`


			`def to_camel_case(snake_str):`
			`components = snake_str.split('_')`
			`return ''.join(x.title() for x in components)`


			`def warning(message):`
			`print('warning: ' + message, file=sys.stderr)`


			`def resolve_emitter(emitter):`
			`if isinstance(emitter, types.FunctionType):`
			`if repr(emitter).startswith('<function bygroups.'):`
			`args = emitter.__closure__[0].cell_contents`
			`emitter = 'ByGroups(%s)' % ', '.join(resolve_emitter(e) for e in args)`
			`elif repr(emitter).startswith('<function using.'):`
			`args = emitter.__closure__[0].cell_contents`
			`if isinstance(args, dict):`
			`state = 'root'`
			`if 'stack' in args:`
			`state = args['stack'][1]`
			`args.pop('stack')`
			`assert args == {}, args`
			`emitter = 'UsingSelf("%s")' % state`
			`elif issubclass(args, pygments_lexer.Lexer):`
Fixes: css, html, php. 2017-06-05 11:17:38 +10:00			`name = args.__name__`
			`if name.endswith('Lexer'):`
			`name = name[:-5]`
Fix bug with nested newlines. Fixes #124. Also reinstitute lexer tests that disappeared during package split. 2018-03-03 10:16:21 +11:00			`emitter = 'Using(%s)' % name`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`else:`
			`raise ValueError('only support "using" with lexer classes, not %r' % args)`
			`else:`
			`warning('unsupported emitter function %r' % emitter)`
HTML formatter + import all Pygments styles. 2017-07-19 23:51:16 -07:00			`emitter = '?? %r ??' % emitter`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`elif isinstance(emitter, _TokenType):`
			`emitter = str(emitter).replace('.', '')[5:]`
			`elif emitter is None:`
			`# This generally only occurs when a lookahead/behind assertion is used, so we just allow it`
			`# through.`
			`return 'None'`
			`else:`
			`raise ValueError('unsupported emitter type %r' % emitter)`
			`assert isinstance(emitter, str)`
			`return emitter`


			`def process_state_action(action):`
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`if isinstance(action, tuple):`
			`return functools.reduce(lambda a, b: a + b, (process_state_action(a) for a in action))`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`if action.startswith('#'):`
			`action = action[1:]`
Support #pop:<n>. 2017-06-05 11:14:24 +10:00			`if action== 'pop':`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`action = 'Pop(1)'`
Support #pop:<n>. 2017-06-05 11:14:24 +10:00			`elif action.startswith('pop:'):`
			`action = 'Pop(%s)' % action[4:]`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`elif action == 'push':`
			`action = 'Push()'`
			`elif action.startswith('push:'):`
			`action = 'Push("%s")' % action[5:]`
			`else:`
			`raise ValueError('unsupported action %r' % (action,))`
			`else:`
			`action = 'Push("%s")' % action`
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`return (action,)`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00

			`def translate_rules(rules):`
			`out = []`
			`for rule in rules:`
			`if isinstance(rule, tuple):`
			`regex = rule[0]`
			`if isinstance(regex, str):`
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`regex = go_regex(regex)`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`elif isinstance(regex, pygments_lexer.words):`
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`regex = 'Words(%s, %s, %s)' % (go_string(regex.prefix),`
			`go_string(regex.suffix),`
			`', '.join(go_string(w) for w in regex.words))`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`else:`
			`raise ValueError('expected regex string but got %r' % regex)`
			`emitter = resolve_emitter(rule[1])`
			`if len(rule) == 2:`
			`modifier = 'nil'`
			`elif type(rule[2]) is str:`
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`modifier = process_state_action(rule[2])[0]`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`elif isinstance(rule[2], pygments_lexer.combined):`
			`modifier = 'Combined("%s")' % '", "'.join(rule[2])`
			`elif type(rule[2]) is tuple:`
			`modifier = 'Push("%s")' % '", "'.join(rule[2])`
			`else:`
			`raise ValueError('unsupported modifier %r' % (rule[2],))`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`out.append('{{{}, {}, {}}}'.format(regex, emitter, modifier))`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`elif isinstance(rule, pygments_lexer.include):`
			`out.append('Include("{}")'.format(rule))`
			`elif isinstance(rule, pygments_lexer.default):`
Switch to github.com/dlclark/regexp2. This makes translating Pygments lexers much much simpler (and possible). 2017-09-18 11:16:44 +10:00			`out.append('Default({})'.format(', '.join(process_state_action(rule.state))))`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`else:`
			`raise ValueError('unsupported rule %r' % (rule,))`
			`return out`


Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`class TemplateView(object):`
			`def __init__(self, **kwargs):`
			`for key, value in kwargs.items():`
			`setattr(self, key, value)`

			`def re_not_multiline(self):`
			`return not (self.regex_flags & re.MULTILINE)`

			`def re_dotall(self):`
			`return self.regex_flags & re.DOTALL`

			`def re_ignorecase(self):`
			`return self.regex_flags & re.IGNORECASE`


Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00			`def main():`
			`package_name, symbol_name = sys.argv[1].rsplit(sep=".", maxsplit=1)`

			`package = importlib.import_module(package_name)`

			`lexer_cls = getattr(package, symbol_name)`

			`assert issubclass(lexer_cls, pygments_lexer.RegexLexer), 'can only translate from RegexLexer'`

Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`print(pystache.render(TEMPLATE, TemplateView(`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`package=lexer_cls.name.lower()[0],`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`name=lexer_cls.name,`
			`regex_flags=lexer_cls.flags,`
pygments2chroma updates (#509) 2021-05-19 01:15:25 +03:00			`upper_name=to_camel_case(re.sub(r'\W', '_', lexer_cls.name)),`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`aliases=lexer_cls.aliases,`
			`filenames=lexer_cls.filenames,`
			`mimetypes=lexer_cls.mimetypes,`
			`tokens=[{'state': state, 'rules': translate_rules(rules)} for (state, rules) in lexer_cls.get_tokendefs().items()],`
			`)))`
Add pygments2chroma.py script. 2017-06-04 22:38:52 +10:00

			`if __name__ == '__main__':`
Support go modules + VB.Net lexer. Fixes #201. 2018-12-03 20:38:33 -10:00			`main()`