1
0
mirror of https://github.com/alecthomas/chroma.git synced 2025-02-09 13:23:51 +02:00
chroma/regexp.go
Alec Thomas cc2dd5b8ad Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.

But the biggest change is switching to an optional XML format for the
regex lexer.

Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.

Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).

Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.

Why not earlier? Prior to the existence of fs.FS this was not a viable
option.

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):       5.3 ms ±   0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):     4.2 ms …   6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):      50.6 ms ±   0.5 ms    [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):    49.2 ms …  51.5 ms    51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):       6.9 ms ±   1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):     5.7 ms …  19.9 ms    196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

A slight increase in init time, but I think this is okay given the
increase in flexibility.

And binary size difference:

    $ du -h lexers.test*
    $ du -sh chroma*                                                                                                                                                                                                                                                                                                                                                                                                                                                             951371ms
    8.8M	chroma.master
    7.8M	chroma.xml
    7.8M	chroma.xml-pre-opt

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):       5.3 ms ±   0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):     4.2 ms …   6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):      50.6 ms ±   0.5 ms    [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):    49.2 ms …  51.5 ms    51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):       6.9 ms ±   1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):     5.7 ms …  19.9 ms    196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

Incompatible changes:

- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-27 15:22:00 +11:00

481 lines
12 KiB
Go

package chroma
import (
"fmt"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"time"
"unicode/utf8"
"github.com/dlclark/regexp2"
)
// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
Pattern string
Type Emitter
Mutator Mutator
}
// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
sort.Slice(words, func(i, j int) bool {
return len(words[j]) < len(words[i])
})
for i, word := range words {
words[i] = regexp.QuoteMeta(word)
}
return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
var out []Token
it, err := lexer.Tokenise(options, text)
if err != nil {
return nil, err
}
for t := it(); t != EOF; t = it() {
out = append(out, t)
}
return out, nil
}
// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule
// Rename clones rules then a rule.
func (r Rules) Rename(oldRule, newRule string) Rules {
r = r.Clone()
r[newRule] = r[oldRule]
delete(r, oldRule)
return r
}
// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
out := map[string][]Rule{}
for key, rules := range r {
out[key] = make([]Rule, len(rules))
copy(out[key], rules)
}
return out
}
// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
out := r.Clone()
for k, v := range rules.Clone() {
out[k] = v
}
return out
}
// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
lexer, err := NewLexer(config, rulesFunc)
if err != nil {
panic(err)
}
return lexer
}
// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
if config == nil {
config = &Config{}
}
for _, glob := range append(config.Filenames, config.AliasFilenames...) {
_, err := filepath.Match(glob, "")
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
r := &RegexLexer{
config: config,
fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
}
// One-off code to generate XML lexers in the Chroma source tree.
// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
// data, err := Marshal(r)
// if err != nil {
// if errors.Is(err, ErrNotSerialisable) {
// fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
// return r, nil
// }
// return nil, err
// }
// _, file, _, ok := runtime.Caller(2)
// if !ok {
// panic("??")
// }
// fmt.Println(file)
// if strings.Contains(file, "/lexers/") {
// dir := filepath.Join(filepath.Dir(file), "embedded")
// err = os.MkdirAll(dir, 0700)
// if err != nil {
// return nil, err
// }
// filename := filepath.Join(dir, name) + ".xml"
// fmt.Println(filename)
// err = ioutil.WriteFile(filename, data, 0600)
// if err != nil {
// return nil, err
// }
// }
return r, nil
}
// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
r.trace = trace
return r
}
// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
Rule
Regexp *regexp2.Regexp
flags string
}
// CompiledRules is a map of rule name to sequence of compiled rules in that rule.
type CompiledRules map[string][]*CompiledRule
// LexerState contains the state for a single lex.
type LexerState struct {
Lexer *RegexLexer
Registry *LexerRegistry
Text []rune
Pos int
Rules CompiledRules
Stack []string
State string
Rule int
// Group matches.
Groups []string
// Named Group matches.
NamedGroups map[string]string
// Custum context for mutators.
MutatorContext map[interface{}]interface{}
iteratorStack []Iterator
options *TokeniseOptions
newlineAdded bool
}
// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
l.MutatorContext[key] = value
}
// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
return l.MutatorContext[key]
}
// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
end := len(l.Text)
if l.newlineAdded {
end--
}
for l.Pos < end && len(l.Stack) > 0 {
// Exhaust the iterator stack, if any.
for len(l.iteratorStack) > 0 {
n := len(l.iteratorStack) - 1
t := l.iteratorStack[n]()
if t == EOF {
l.iteratorStack = l.iteratorStack[:n]
continue
}
return t
}
l.State = l.Stack[len(l.Stack)-1]
if l.Lexer.trace {
fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
}
selectedRule, ok := l.Rules[l.State]
if !ok {
panic("unknown state " + l.State)
}
ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
// No match.
if groups == nil {
// From Pygments :\
//
// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
// emptied and the lexer continues scanning in the 'root' state. This can help producing
// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
// closed.
if l.Text[l.Pos] == '\n' && l.State != l.options.State {
l.Stack = []string{l.options.State}
continue
}
l.Pos++
return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
}
l.Rule = ruleIndex
l.Groups = groups
l.NamedGroups = namedGroups
l.Pos += utf8.RuneCountInString(groups[0])
if rule.Mutator != nil {
if err := rule.Mutator.Mutate(l); err != nil {
panic(err)
}
}
if rule.Type != nil {
l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
}
}
// Exhaust the IteratorStack, if any.
// Duplicate code, but eh.
for len(l.iteratorStack) > 0 {
n := len(l.iteratorStack) - 1
t := l.iteratorStack[n]()
if t == EOF {
l.iteratorStack = l.iteratorStack[:n]
continue
}
return t
}
// If we get to here and we still have text, return it as an error.
if l.Pos != len(l.Text) && len(l.Stack) == 0 {
value := string(l.Text[l.Pos:])
l.Pos = len(l.Text)
return Token{Type: Error, Value: value}
}
return EOF
}
// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
config *Config
analyser func(text string) float32
trace bool
mu sync.Mutex
compiled bool
rawRules Rules
rules map[string][]*CompiledRule
fetchRulesFunc func() (Rules, error)
compileOnce sync.Once
}
func (r *RegexLexer) String() string {
return r.config.Name
}
// Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
if err := r.needRules(); err != nil {
return nil, err
}
return r.rawRules, nil
}
// SetRegistry the lexer will use to lookup other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
r.registry = registry
return r
}
// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
r.analyser = analyser
return r
}
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
if r.analyser != nil {
return r.analyser(text)
}
return 0.0
}
// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
r.config = config
return r
}
func (r *RegexLexer) Config() *Config { // nolint
return r.config
}
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
r.mu.Lock()
defer r.mu.Unlock()
if r.compiled {
return nil
}
for state, rules := range r.rules {
for i, rule := range rules {
if rule.Regexp == nil {
pattern := "(?:" + rule.Pattern + ")"
if rule.flags != "" {
pattern = "(?" + rule.flags + ")" + pattern
}
pattern = `\G` + pattern
rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
if err != nil {
return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
}
rule.Regexp.MatchTimeout = time.Millisecond * 250
}
}
}
restart:
seen := map[LexerMutator]bool{}
for state := range r.rules {
for i := 0; i < len(r.rules[state]); i++ {
rule := r.rules[state][i]
if compile, ok := rule.Mutator.(LexerMutator); ok {
if seen[compile] {
return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
}
seen[compile] = true
if err := compile.MutateLexer(r.rules, state, i); err != nil {
return err
}
// Process the rules again in case the mutator added/removed rules.
//
// This sounds bad, but shouldn't be significant in practice.
goto restart
}
}
}
r.compiled = true
return nil
}
func (r *RegexLexer) fetchRules() error {
rules, err := r.fetchRulesFunc()
if err != nil {
return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
}
if _, ok := rules["root"]; !ok {
return fmt.Errorf("no \"root\" state")
}
compiledRules := map[string][]*CompiledRule{}
for state, rules := range rules {
compiledRules[state] = nil
for _, rule := range rules {
flags := ""
if !r.config.NotMultiline {
flags += "m"
}
if r.config.CaseInsensitive {
flags += "i"
}
if r.config.DotAll {
flags += "s"
}
compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
}
}
r.rawRules = rules
r.rules = compiledRules
return nil
}
func (r *RegexLexer) needRules() error {
var err error
if r.fetchRulesFunc != nil {
r.compileOnce.Do(func() {
err = r.fetchRules()
})
}
if err := r.maybeCompile(); err != nil {
return err
}
return err
}
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
err := r.needRules()
if err != nil {
return nil, err
}
if options == nil {
options = defaultOptions
}
if options.EnsureLF {
text = ensureLF(text)
}
newlineAdded := false
if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
text += "\n"
newlineAdded = true
}
state := &LexerState{
Registry: r.registry,
newlineAdded: newlineAdded,
options: options,
Lexer: r,
Text: []rune(text),
Stack: []string{options.State},
Rules: r.rules,
MutatorContext: map[interface{}]interface{}{},
}
return state.Iterator, nil
}
// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
rules, err := r.Rules()
if err != nil {
panic(err)
}
return rules
}
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
for i, rule := range rules {
match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
if match != nil && err == nil && match.Index == pos {
groups := []string{}
namedGroups := make(map[string]string)
for _, g := range match.Groups() {
namedGroups[g.Name] = g.String()
groups = append(groups, g.String())
}
return i, rule, groups, namedGroups
}
}
return 0, &CompiledRule{}, nil, nil
}
// replace \r and \r\n with \n
// same as strings.ReplaceAll but more efficient
func ensureLF(text string) string {
buf := make([]byte, len(text))
var j int
for i := 0; i < len(text); i++ {
c := text[i]
if c == '\r' {
if i < len(text)-1 && text[i+1] == '\n' {
continue
}
c = '\n'
}
buf[j] = c
j++
}
return string(buf[:j])
}