
Version 2 of Chroma

This cleans up the API in general: removing deprecated
functionality, untangling circular imports, and so on.

But the biggest change is switching to an optional XML format for the
regex lexer.

Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.

Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (e.g. the Go template lexer XML compresses
from 3239 bytes to 718).
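
For a sense of the format, a minimal lexer definition might look
roughly like this (an illustrative sketch; the exact element and
attribute names are an assumption, not taken from this commit):

    <lexer>
      <config>
        <name>Example</name>
        <alias>example</alias>
        <filename>*.ex</filename>
      </config>
      <rules>
        <state name="root">
          <rule pattern="\b(?:if|else|for)\b">
            <token type="Keyword"/>
          </rule>
          <rule pattern="\s+">
            <token type="Text"/>
          </rule>
        </state>
      </rules>
    </lexer>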

Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.

Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
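
To illustrate why fs.FS matters here, a minimal sketch of the
pattern it enables (the directory layout and file names are
assumptions, not from this commit):

    package main

    import (
        "embed"
        "fmt"
        "io/fs"
    )

    // Embed the XML lexer definitions into the binary; fs.FS lets
    // the loader read them like an ordinary filesystem at runtime,
    // deferring parsing until a lexer is actually requested.
    //go:embed embedded/*.xml
    var embedded embed.FS

    func main() {
        matches, err := fs.Glob(embedded, "embedded/*.xml")
        if err != nil {
            panic(err)
        }
        for _, path := range matches {
            data, err := fs.ReadFile(embedded, path)
            if err != nil {
                panic(err)
            }
            fmt.Printf("%s: %d bytes\n", path, len(data))
        }
    }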

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):       5.3 ms ±   0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):     4.2 ms …   6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):      50.6 ms ±   0.5 ms    [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):    49.2 ms …  51.5 ms    51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):       6.9 ms ±   1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):     5.7 ms …  19.9 ms    196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

A slight increase in init time, but I think this is okay given the
increase in flexibility.

And the binary size difference:

    $ du -sh chroma*
    8.8M	chroma.master
    7.8M	chroma.xml
    7.8M	chroma.xml-pre-opt

Incompatible changes:

- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
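
For example, migrating a lexer definition to the new API might look
like this (a sketch; the rules, names, and v2 import path here are
assumptions, not part of this change list):

    package main

    import "github.com/alecthomas/chroma/v2"

    var lexer = chroma.MustNewLexer(
        &chroma.Config{Name: "example"},
        // v1 took a Rules value directly; v2 takes a function, so
        // building the rules (and compiling their regexes) is
        // deferred until the lexer is first used.
        func() chroma.Rules {
            return chroma.Rules{
                "root": {
                    {`\d+`, chroma.LiteralNumber, nil},
                    // Using now takes a lexer name rather than a
                    // Lexer value; it is resolved through the
                    // LexerRegistry when tokenising.
                    {`<%.*?%>`, chroma.Using("Go"), nil},
                },
            }
        },
    )
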
Author: Alec Thomas
Date:   2022-01-03 23:51:17 +11:00
Parent: d491f1b5c1
Commit: cc2dd5b8ad

490 changed files with 32816 additions and 14470 deletions

regexp.go (308 lines changed)

@@ -21,156 +21,6 @@ type Rule struct {
Mutator Mutator
}
// An Emitter takes group matches and returns tokens.
type Emitter interface {
// Emit tokens for the given regex groups.
Emit(groups []string, state *LexerState) Iterator
}
// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, state *LexerState) Iterator
// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, state *LexerState) Iterator {
return e(groups, state)
}
// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(emitters ...Emitter) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
iterators := make([]Iterator, 0, len(groups)-1)
if len(emitters) != len(groups)-1 {
iterators = append(iterators, Error.Emit(groups, state))
// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters))
} else {
for i, group := range groups[1:] {
if emitters[i] != nil {
iterators = append(iterators, emitters[i].Emit([]string{group}, state))
}
}
}
return Concaterator(iterators...)
})
}
// ByGroupNames emits a token for each named matching group in the rule's regex.
func ByGroupNames(emitters map[string]Emitter) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
iterators := make([]Iterator, 0, len(state.NamedGroups)-1)
if len(state.NamedGroups)-1 == 0 {
if emitter, ok := emitters[`0`]; ok {
iterators = append(iterators, emitter.Emit(groups, state))
} else {
iterators = append(iterators, Error.Emit(groups, state))
}
} else {
ruleRegex := state.Rules[state.State][state.Rule].Regexp
for i := 1; i < len(state.NamedGroups); i++ {
groupName := ruleRegex.GroupNameFromNumber(i)
group := state.NamedGroups[groupName]
if emitter, ok := emitters[groupName]; ok {
if emitter != nil {
iterators = append(iterators, emitter.Emit([]string{group}, state))
}
} else {
iterators = append(iterators, Error.Emit([]string{group}, state))
}
}
}
return Concaterator(iterators...)
})
}
// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example on a Markdown text block or SQL
// language block.
//
// The sublexer will be retrieved using sublexerGetFunc (typically
// internal.Get), using the captured value from the matched sublexerNameGroup.
//
// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup,
// then tokens for the matched codeGroup will be emitted using the retrieved
// lexer. Otherwise, if the sublexer is nil, then tokens will be emitted from
// the passed emitter.
//
// Example:
//
// var Markdown = internal.Register(MustNewLexer(
// &Config{
// Name: "markdown",
// Aliases: []string{"md", "mkd"},
// Filenames: []string{"*.md", "*.mkd", "*.markdown"},
// MimeTypes: []string{"text/x-markdown"},
// },
// Rules{
// "root": {
// {"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
// UsingByGroup(
// internal.Get,
// 2, 4,
// String, String, String, Text, String,
// ),
// nil,
// },
// },
// },
// ))
//
// See the lexers/m/markdown.go for the complete example.
//
// Note: panics if the number of emitters does not equal the number of matched
// groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
// bounds check
if len(emitters) != len(groups)-1 {
panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
}
// grab sublexer
sublexer := sublexerGetFunc(groups[sublexerNameGroup])
// build iterators
iterators := make([]Iterator, len(groups)-1)
for i, group := range groups[1:] {
if i == codeGroup-1 && sublexer != nil {
var err error
iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
if err != nil {
panic(err)
}
} else if emitters[i] != nil {
iterators[i] = emitters[i].Emit([]string{group}, state)
}
}
return Concaterator(iterators...)
})
}
// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer) Emitter {
return EmitterFunc(func(groups []string, _ *LexerState) Iterator {
it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
if err != nil {
panic(err)
}
return it
})
}
// UsingSelf is like Using, but uses the current Lexer.
func UsingSelf(stateName string) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
it, err := state.Lexer.Tokenise(&TokeniseOptions{State: stateName, Nested: true}, groups[0])
if err != nil {
panic(err)
}
return it
})
}
// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
sort.Slice(words, func(i, j int) bool {
@@ -225,17 +75,20 @@ func (r Rules) Merge(rules Rules) Rules {
return out
}
// MustNewLazyLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLazyLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
lexer, err := NewLazyLexer(config, rulesFunc)
// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
lexer, err := NewLexer(config, rulesFunc)
if err != nil {
panic(err)
}
return lexer
}
// NewLazyLexer creates a new regex-based Lexer with deferred rules generation.
func NewLazyLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
if config == nil {
config = &Config{}
}
@@ -245,31 +98,40 @@ func NewLazyLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
return &RegexLexer{
config: config,
compilerFunc: rulesFunc,
}, nil
}
// MustNewLexer creates a new Lexer or panics.
//
// Deprecated: Use MustNewLazyLexer instead.
func MustNewLexer(config *Config, rules Rules) *RegexLexer { // nolint: forbidigo
lexer, err := NewLexer(config, rules) // nolint: forbidigo
if err != nil {
panic(err)
r := &RegexLexer{
config: config,
fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
}
return lexer
}
// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
//
// Deprecated: Use NewLazyLexer instead.
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) { // nolint: forbidigo
return NewLazyLexer(config, func() Rules { return rules })
// One-off code to generate XML lexers in the Chroma source tree.
// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
// data, err := Marshal(r)
// if err != nil {
// if errors.Is(err, ErrNotSerialisable) {
// fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
// return r, nil
// }
// return nil, err
// }
// _, file, _, ok := runtime.Caller(2)
// if !ok {
// panic("??")
// }
// fmt.Println(file)
// if strings.Contains(file, "/lexers/") {
// dir := filepath.Join(filepath.Dir(file), "embedded")
// err = os.MkdirAll(dir, 0700)
// if err != nil {
// return nil, err
// }
// filename := filepath.Join(dir, name) + ".xml"
// fmt.Println(filename)
// err = ioutil.WriteFile(filename, data, 0600)
// if err != nil {
// return nil, err
// }
// }
return r, nil
}
// Trace enables debug tracing.
@@ -292,13 +154,14 @@ type CompiledRules map[string][]*CompiledRule
// LexerState contains the state for a single lex.
type LexerState struct {
Lexer *RegexLexer
Text []rune
Pos int
Rules CompiledRules
Stack []string
State string
Rule int
Lexer *RegexLexer
Registry *LexerRegistry
Text []rune
Pos int
Rules CompiledRules
Stack []string
State string
Rule int
// Group matches.
Groups []string
// Named Group matches.
@@ -398,19 +261,39 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
config *Config
analyser func(text string) float32
trace bool
mu sync.Mutex
compiled bool
rules map[string][]*CompiledRule
compilerFunc func() Rules
compileOnce sync.Once
mu sync.Mutex
compiled bool
rawRules Rules
rules map[string][]*CompiledRule
fetchRulesFunc func() (Rules, error)
compileOnce sync.Once
}
func (r *RegexLexer) String() string {
return r.config.Name
}
// Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
if err := r.needRules(); err != nil {
return nil, err
}
return r.rawRules, nil
}
// SetRegistry the lexer will use to lookup other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
r.registry = registry
return r
}
// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
r.analyser = analyser
return r
}
@@ -422,6 +305,12 @@ func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
return 0.0
}
// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
r.config = config
return r
}
func (r *RegexLexer) Config() *Config { // nolint
return r.config
}
@@ -473,8 +362,11 @@ restart:
return nil
}
func (r *RegexLexer) compileRules() error {
rules := r.compilerFunc()
func (r *RegexLexer) fetchRules() error {
rules, err := r.fetchRulesFunc()
if err != nil {
return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
}
if _, ok := rules["root"]; !ok {
return fmt.Errorf("no \"root\" state")
}
@@ -496,21 +388,27 @@
}
}
r.rawRules = rules
r.rules = compiledRules
return nil
}
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
func (r *RegexLexer) needRules() error {
var err error
if r.compilerFunc != nil {
if r.fetchRulesFunc != nil {
r.compileOnce.Do(func() {
err = r.compileRules()
err = r.fetchRules()
})
}
if err != nil {
return nil, err
}
if err := r.maybeCompile(); err != nil {
return err
}
return err
}
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
err := r.needRules()
if err != nil {
return nil, err
}
if options == nil {
@@ -525,6 +423,7 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
newlineAdded = true
}
state := &LexerState{
Registry: r.registry,
newlineAdded: newlineAdded,
options: options,
Lexer: r,
@@ -536,6 +435,15 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
return state.Iterator, nil
}
// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
rules, err := r.Rules()
if err != nil {
panic(err)
}
return rules
}
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
for i, rule := range rules {
match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)