mirror of https://github.com/alecthomas/chroma.git
synced 2025-02-09 13:23:51 +02:00
This cleans up the API in general, removing a bunch of deprecated stuff, cleaning up circular imports, etc.

But the biggest change is switching to an optional XML format for the regex lexer. Having lexers defined only in Go is not ideal for a couple of reasons. Firstly, it impedes a significant portion of contributors who use Chroma in Hugo, but don't know Go. Secondly, it bloats the binary size of any project that imports Chroma.

Why XML? YAML is an abomination and JSON is not human editable. XML also compresses very well (eg. the Go template lexer XML compresses from 3239 bytes to 718).

Why a new syntax format? All major existing formats rely on the Oniguruma regex engine, which is extremely complex and for which there is no Go port.

Why not earlier? Prior to the existence of fs.FS this was not a viable option.

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):       5.3 ms ±   0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):     4.2 ms …   6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):      50.6 ms ±   0.5 ms    [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):    49.2 ms …  51.5 ms    51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):       6.9 ms ±   1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):     5.7 ms …  19.9 ms    196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

A slight increase in init time, but I think this is okay given the increase in flexibility.

And binary size difference:

    $ du -sh chroma*
    8.8M    chroma.master
    7.8M    chroma.xml
    7.8M    chroma.xml-pre-opt

Incompatible changes:

- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
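
For example, a lexer previously constructed with inline Rules now supplies a
function that returns them (a hedged sketch; config fields and rules elided):

    // Before (v1):
    var Go = MustNewLexer(&Config{Name: "Go"}, Rules{"root": {...}})

    // After (v2): rules are built lazily, on first use.
    var Go = MustNewLexer(&Config{Name: "Go"}, func() Rules {
        return Rules{"root": {...}}
    })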
481 lines
12 KiB
Go
package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	// Sort longest-first so the regex alternation prefers the longest match.
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}

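// Illustrative usage (not part of the original file): Words quotes each
// literal and joins them into a single alternation, longest first:
//
//	Words(`\b`, `\b`, "in", "insert") // => `\b(insert|in)\b`
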
// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

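// Illustrative usage (a hedged sketch, not part of the original file; assumes
// a lexer obtained elsewhere, e.g. from a lexer registry):
//
//	tokens, err := Tokenise(lexer, nil, "package main")
//	if err != nil {
//		// handle error
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}
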
// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}

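// Illustrative usage (not part of the original file): these helpers are
// non-destructive, each returning a modified clone, so they chain when
// deriving one lexer's rules from another's. State names here are invented:
//
//	derived := base.Rename("strings", "interpolated").
//		Merge(Rules{"root": {{`//.*`, CommentSingle, nil}}})
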
// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

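// Illustrative usage (a hedged sketch, not part of the original file): a
// minimal lexer with a single "root" state. Text, CommentSingle and Keyword
// are assumed to be this package's TokenType constants, used as Emitters:
//
//	var ini = MustNewLexer(&Config{
//		Name:      "INI",
//		Filenames: []string{"*.ini"},
//	}, func() Rules {
//		return Rules{
//			"root": {
//				{`\s+`, Text, nil},
//				{`[;#].*`, CommentSingle, nil},
//				{`\[.*?\]`, Keyword, nil},
//			},
//		}
//	})
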
// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

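// Illustrative usage (a hedged sketch, not part of the original file): a
// Mutator can stash per-lex state in MutatorContext, e.g. to remember the
// delimiter that opened a heredoc. The key type and mutator are invented for
// illustration, and assume MutatorFunc adapts a function to the Mutator
// interface as it does elsewhere in this package:
//
//	type heredocKey struct{}
//
//	openHeredoc := MutatorFunc(func(l *LexerState) error {
//		l.Set(heredocKey{}, l.Groups[1]) // remember the delimiter
//		return nil
//	})
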
// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

// AnalyseText delegates to the analyser set via SetAnalyser, returning 0.0 if
// no analyser is set.
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

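// Illustrative usage (not part of the original file): attaching a simple
// shebang-based analyser to a lexer:
//
//	lexer.SetAnalyser(func(text string) float32 {
//		if strings.HasPrefix(text, "#!/bin/bash") {
//			return 1.0
//		}
//		return 0.0
//	})
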
// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				// Anchor with \G so the pattern can only match at the current scan position.
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

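// Illustrative usage (not part of the original file): Tokenise returns an
// Iterator, a function that yields one Token per call and EOF when done:
//
//	it, err := lexer.Tokenise(nil, source)
//	if err != nil {
//		// handle error
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Println(t)
//	}
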
// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

// matchRules returns the index of the first rule matching at pos, the rule
// itself, and its group and named-group matches, or nil groups if none match.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces \r and \r\n with \n, like strings.ReplaceAll but in a
// single pass.
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}
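
// Illustrative behaviour (not part of the original file): both lone \r and
// \r\n collapse to \n in one pass:
//
//	ensureLF("a\r\nb\rc") // == "a\nb\nc"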