2017-09-18 11:16:44 +10:00
|
|
|
import functools
|
2017-06-04 22:38:52 +10:00
|
|
|
import importlib
|
2017-09-18 11:16:44 +10:00
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import re
|
2017-06-04 22:38:52 +10:00
|
|
|
import sys
|
|
|
|
import types
|
|
|
|
|
|
|
|
import pystache
|
|
|
|
from pygments import lexer as pygments_lexer
|
|
|
|
from pygments.token import _TokenType
|
|
|
|
|
|
|
|
|
|
|
|
# Mustache template (rendered with pystache in main()) for the generated Go
# lexer source file.
#
# Values read from the view: package, upper_name, name, aliases, filenames,
# mimetypes, the re_not_multiline / re_dotall / re_ignorecase sections, and
# tokens (each entry providing "state" and a list of "rules").
#
# The delimiters are switched to <% %> around the []string{...} literals,
# presumably so the Go braces next to a tag are not read as mustache tags, and
# switched back right after.  {{{.}}} is the triple-mustache form, so each
# already-formatted rule is emitted without HTML escaping.
#
# NOTE(review): the template imports chroma/v2 but still calls
# MustNewLazyLexer -- confirm that this constructor exists in the targeted
# Chroma version.
TEMPLATE = r'''
package {{package}}

import (
	. "github.com/alecthomas/chroma/v2" // nolint
	"github.com/alecthomas/chroma/v2/lexers/internal"
)

// {{upper_name}} lexer.
var {{upper_name}} = internal.Register(MustNewLazyLexer(
	&Config{
		Name: "{{name}}",
		{{=<% %>=}}
		Aliases: []string{<%#aliases%>"<%.%>", <%/aliases%>},
		Filenames: []string{<%#filenames%>"<%.%>", <%/filenames%>},
		MimeTypes: []string{<%#mimetypes%>"<%.%>", <%/mimetypes%>},
		<%={{ }}=%>
		{{#re_not_multiline}}
		NotMultiline: true,
		{{/re_not_multiline}}
		{{#re_dotall}}
		DotAll: true,
		{{/re_dotall}}
		{{#re_ignorecase}}
		CaseInsensitive: true,
		{{/re_ignorecase}}
	},
	func() Rules {
		return Rules{
			{{#tokens}}
			"{{state}}": {
				{{#rules}}
				{{{.}}},
				{{/rules}}
			},
			{{/tokens}}
		}
	},
))
'''
|
|
|
|
|
|
|
|
|
2017-09-18 11:16:44 +10:00
|
|
|
def go_regex(s):
    """Render the regular expression source *s* as a Go string literal."""
    # A regex needs no treatment beyond ordinary string quoting.
    return go_string(s)
|
|
|
|
|
|
|
|
|
2017-06-04 22:38:52 +10:00
|
|
|
def go_string(s):
    """Return *s* rendered as a Go string literal.

    A raw (backtick-quoted) literal is used whenever it can represent *s*
    exactly; otherwise a double-quoted literal is produced via JSON escaping.
    """
    # A Go raw string literal cannot contain a backtick and silently discards
    # carriage returns, so either character forces the escaped form.
    if '`' not in s and '\r' not in s:
        return '`' + s + '`'
    # ensure_ascii=False keeps non-ASCII text verbatim.  The default would
    # write characters outside the BMP as UTF-16 surrogate pairs, and Go
    # rejects surrogate halves in \u escapes.
    return json.dumps(s, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
def to_camel_case(snake_str):
    """Join the underscore-separated parts of *snake_str*, title-casing each.

    Every part goes through ``str.title``, so letters after the first one in
    a run are lower-cased (``'HTML'`` becomes ``'Html'``).
    """
    return ''.join(map(str.title, snake_str.split('_')))
|
|
|
|
|
|
|
|
|
|
|
|
def warning(message):
    """Print *message* to standard error, prefixed with ``warning: ``."""
    text = 'warning: ' + message
    print(text, file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
|
|
def resolve_emitter(emitter):
    """Translate a Pygments rule emitter into the text of a Go emitter.

    Handled inputs:

    * a ``bygroups`` callback -> ``ByGroups(...)``, resolving each captured
      emitter recursively;
    * a ``using`` callback whose first closure cell is a dict ->
      ``UsingSelf("<state>")``; the state is the second item of the ``stack``
      entry, or ``root`` when there is none;
    * a ``using`` callback whose first closure cell is a lexer class ->
      ``Using(<Name>)``, with a trailing ``Lexer`` removed from the class name;
    * any other function -> a ``?? ... ??`` placeholder, after a warning;
    * a token type -> its dotted name with the dots removed and the first
      five characters dropped (presumably the leading ``Token``);
    * ``None`` -> ``'None'``.

    Raises:
        ValueError: for a ``using`` target that is neither a dict nor a lexer
            class, and for any other unsupported emitter type.
    """
    if isinstance(emitter, types.FunctionType):
        # The callbacks are closures, so the factory that produced one is
        # recognised through the qualified name shown in its repr().
        description = repr(emitter)
        if description.startswith('<function bygroups.'):
            args = emitter.__closure__[0].cell_contents
            emitter = 'ByGroups(%s)' % ', '.join(resolve_emitter(e) for e in args)
        elif description.startswith('<function using.'):
            args = emitter.__closure__[0].cell_contents
            if isinstance(args, dict):
                # Work on a copy: the dict lives in the callback's closure, so
                # popping from it directly would make a second resolution of
                # the same emitter fall back to "root".
                options = dict(args)
                state = 'root'
                if 'stack' in options:
                    state = options.pop('stack')[1]
                # Anything besides "stack" is an option this tool cannot map.
                assert options == {}, options
                emitter = 'UsingSelf("%s")' % state
            elif isinstance(args, type) and issubclass(args, pygments_lexer.Lexer):
                name = args.__name__
                if name.endswith('Lexer'):
                    name = name[:-5]
                emitter = 'Using(%s)' % name
            else:
                # The isinstance(type) guard above lets non-class values land
                # here instead of making issubclass() raise TypeError.
                raise ValueError('only support "using" with lexer classes, not %r' % args)
        else:
            warning('unsupported emitter function %r' % emitter)
            emitter = '?? %r ??' % emitter
    elif isinstance(emitter, _TokenType):
        emitter = str(emitter).replace('.', '')[5:]
    elif emitter is None:
        # This generally only occurs when a lookahead/behind assertion is
        # used, so we just allow it through.
        return 'None'
    else:
        raise ValueError('unsupported emitter type %r' % emitter)
    assert isinstance(emitter, str)
    return emitter
|
|
|
|
|
|
|
|
|
|
|
|
def process_state_action(action):
    """Translate a Pygments state action into a tuple of Go mutator strings.

    *action* is either a string or a (possibly nested) tuple of actions.
    A plain state name becomes ``Push("<name>")``.  Strings starting with
    ``#`` are directives: ``#pop`` -> ``Pop(1)``, ``#pop:N`` -> ``Pop(N)``,
    ``#push`` -> ``Push()`` and ``#push:name`` -> ``Push("name")``.

    Returns:
        A tuple with one string per action, in order; an empty tuple yields
        an empty tuple.

    Raises:
        ValueError: if a ``#`` directive is not one of the forms above.
    """
    if isinstance(action, tuple):
        # Flatten recursively; unlike reduce() without an initial value this
        # also copes with an empty tuple.
        return tuple(
            mutator
            for part in action
            for mutator in process_state_action(part)
        )
    if not action.startswith('#'):
        return ('Push("%s")' % action,)
    directive = action[1:]
    if directive == 'pop':
        return ('Pop(1)',)
    if directive.startswith('pop:'):
        return ('Pop(%s)' % directive[4:],)
    if directive == 'push':
        return ('Push()',)
    if directive.startswith('push:'):
        return ('Push("%s")' % directive[5:],)
    raise ValueError('unsupported action %r' % (directive,))
|
2017-06-04 22:38:52 +10:00
|
|
|
|
|
|
|
|
|
|
|
def translate_rules(rules):
    """Convert the rules of one Pygments state into Go rule source strings.

    Tuple rules become ``{pattern, emitter, modifier}``, ``include`` entries
    become ``Include("...")`` and ``default`` entries become ``Default(...)``.

    Raises:
        ValueError: for an unsupported pattern, modifier or rule type.
    """
    translated = []
    for rule in rules:
        if isinstance(rule, tuple):
            pattern = rule[0]
            if isinstance(pattern, str):
                pattern = go_regex(pattern)
            elif isinstance(pattern, pygments_lexer.words):
                word_list = pattern
                pattern = 'Words(%s, %s, %s)' % (
                    go_string(word_list.prefix),
                    go_string(word_list.suffix),
                    ', '.join(go_string(w) for w in word_list.words),
                )
            else:
                raise ValueError('expected regex string but got %r' % pattern)

            emitter = resolve_emitter(rule[1])

            if len(rule) == 2:
                # No third element: the rule leaves the state stack alone.
                modifier = 'nil'
            else:
                target = rule[2]
                if type(target) is str:
                    modifier = process_state_action(target)[0]
                elif isinstance(target, pygments_lexer.combined):
                    # Tested before the plain-tuple case below.
                    modifier = 'Combined("%s")' % '", "'.join(target)
                elif type(target) is tuple:
                    modifier = 'Push("%s")' % '", "'.join(target)
                else:
                    raise ValueError('unsupported modifier %r' % (target,))

            translated.append('{%s, %s, %s}' % (pattern, emitter, modifier))
        elif isinstance(rule, pygments_lexer.include):
            translated.append('Include("%s")' % rule)
        elif isinstance(rule, pygments_lexer.default):
            actions = process_state_action(rule.state)
            translated.append('Default(%s)' % ', '.join(actions))
        else:
            raise ValueError('unsupported rule %r' % (rule,))
    return translated
|
|
|
|
|
|
|
|
|
2017-06-05 10:29:50 +10:00
|
|
|
class TemplateView:
    """View object handed to pystache when rendering the template.

    Every keyword argument becomes an instance attribute.  The ``re_*``
    methods derive values from the ``regex_flags`` attribute.
    """

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def re_not_multiline(self):
        """Return True when ``re.MULTILINE`` is absent from the flags."""
        multiline = self.regex_flags & re.MULTILINE
        return not multiline

    def re_dotall(self):
        """Return the ``re.DOTALL`` bit of the flags (zero when unset)."""
        flags = self.regex_flags
        return flags & re.DOTALL

    def re_ignorecase(self):
        """Return the ``re.IGNORECASE`` bit of the flags (zero when unset)."""
        flags = self.regex_flags
        return flags & re.IGNORECASE
|
|
|
|
|
|
|
|
|
2017-06-04 22:38:52 +10:00
|
|
|
def main():
    """Translate the Pygments lexer named on the command line into Go source.

    The first command-line argument is the dotted path of a lexer class; it
    is split at the last dot into a module to import and an attribute to look
    up.  The rendered template is printed to standard output.

    Exits with an error message when the argument is missing, contains no
    dot, or does not name a ``RegexLexer`` subclass.
    """
    # Fail with a readable message rather than an IndexError / unpacking
    # ValueError when the argument is missing or has no module part.
    if len(sys.argv) < 2 or '.' not in sys.argv[1]:
        sys.exit('usage: pass the dotted path of a lexer class, '
                 'e.g. <package.module.LexerClass>')

    package_name, symbol_name = sys.argv[1].rsplit(sep=".", maxsplit=1)
    package = importlib.import_module(package_name)
    lexer_cls = getattr(package, symbol_name)

    # Explicit check instead of ``assert`` so it still runs under ``-O``.
    if not (isinstance(lexer_cls, type)
            and issubclass(lexer_cls, pygments_lexer.RegexLexer)):
        sys.exit('can only translate from RegexLexer')

    tokens = [
        {'state': state, 'rules': translate_rules(rules)}
        for state, rules in lexer_cls.get_tokendefs().items()
    ]
    view = TemplateView(
        # The Go package name is the first character of the lower-cased name.
        package=lexer_cls.name.lower()[0],
        name=lexer_cls.name,
        regex_flags=lexer_cls.flags,
        # Non-word characters become separators before camel-casing.
        upper_name=to_camel_case(re.sub(r'\W', '_', lexer_cls.name)),
        aliases=lexer_cls.aliases,
        filenames=lexer_cls.filenames,
        mimetypes=lexer_cls.mimetypes,
        tokens=tokens,
    )
    print(pystache.render(TEMPLATE, view))
|
2017-06-04 22:38:52 +10:00
|
|
|
|
|
|
|
|
|
|
|
# Run the translator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|