2017-09-20 20:15:35 +10:00
|
|
|
package chroma
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2017-09-21 17:52:28 +10:00
|
|
|
"os"
|
2021-09-27 14:27:46 +10:00
|
|
|
"path/filepath"
|
2017-09-20 20:15:35 +10:00
|
|
|
"regexp"
|
2021-05-08 01:10:18 +02:00
|
|
|
"sort"
|
2017-09-20 20:15:35 +10:00
|
|
|
"strings"
|
|
|
|
"sync"
|
2020-07-08 20:22:02 +10:00
|
|
|
"time"
|
2017-09-20 20:36:25 +10:00
|
|
|
"unicode/utf8"
|
2017-09-20 20:15:35 +10:00
|
|
|
|
|
|
|
"github.com/dlclark/regexp2"
|
|
|
|
)
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// A Rule is the fundamental matching unit of the Regex lexer state machine.
|
2017-09-20 20:15:35 +10:00
|
|
|
type Rule struct {
|
|
|
|
Pattern string
|
|
|
|
Type Emitter
|
|
|
|
Mutator Mutator
|
|
|
|
}
|
|
|
|
|
|
|
|
// Words creates a regex that matches any of the given literal words.
//
// Each word is escaped with regexp.QuoteMeta and the words are joined into a
// single alternation ordered longest first. The result has the form
// prefix + "(" + alternation + ")" + suffix.
//
// The caller's words slice is never modified.
func Words(prefix, suffix string, words ...string) string {
	// Work on a private copy: a caller passing an existing slice via words...
	// shares its backing array with us, and sorting/quoting it in place would
	// reorder the caller's data and double-escape it on a second call.
	quoted := make([]string, len(words))
	copy(quoted, words)
	sort.Slice(quoted, func(i, j int) bool {
		return len(quoted[j]) < len(quoted[i])
	})
	for i, word := range quoted {
		quoted[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(quoted, `|`) + `)` + suffix
}
|
|
|
|
|
|
|
|
// Tokenise text using lexer, returning tokens as a slice.
|
2018-11-03 16:22:51 -07:00
|
|
|
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
|
|
|
|
var out []Token
|
2017-09-20 22:19:36 +10:00
|
|
|
it, err := lexer.Tokenise(options, text)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2018-11-03 16:22:51 -07:00
|
|
|
for t := it(); t != EOF; t = it() {
|
2017-09-20 22:19:36 +10:00
|
|
|
out = append(out, t)
|
|
|
|
}
|
|
|
|
return out, nil
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
// Rules maps from state to a sequence of Rules.
|
|
|
|
type Rules map[string][]Rule
|
|
|
|
|
2020-06-30 20:56:49 +10:00
|
|
|
// Rename clones rules then a rule.
|
2021-04-29 12:07:50 +10:00
|
|
|
func (r Rules) Rename(oldRule, newRule string) Rules {
|
2020-06-30 20:56:49 +10:00
|
|
|
r = r.Clone()
|
2021-04-29 12:07:50 +10:00
|
|
|
r[newRule] = r[oldRule]
|
|
|
|
delete(r, oldRule)
|
2020-06-30 20:56:49 +10:00
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// Clone returns a clone of the Rules.
|
2018-02-07 22:11:40 +11:00
|
|
|
func (r Rules) Clone() Rules {
|
|
|
|
out := map[string][]Rule{}
|
|
|
|
for key, rules := range r {
|
|
|
|
out[key] = make([]Rule, len(rules))
|
|
|
|
copy(out[key], rules)
|
|
|
|
}
|
|
|
|
return out
|
|
|
|
}
|
|
|
|
|
2020-05-16 16:04:08 +10:00
|
|
|
// Merge creates a clone of "r" then merges "rules" into the clone.
|
|
|
|
func (r Rules) Merge(rules Rules) Rules {
|
|
|
|
out := r.Clone()
|
|
|
|
for k, v := range rules.Clone() {
|
|
|
|
out[k] = v
|
|
|
|
}
|
|
|
|
return out
|
|
|
|
}
|
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
// MustNewLexer creates a new Lexer with deferred rules generation or panics.
|
2023-08-22 08:39:01 +10:00
|
|
|
func MustNewLexer(config *Config, rules func() Rules) *RegexLexer {
|
|
|
|
lexer, err := NewLexer(config, rules)
|
2021-02-07 19:16:49 -06:00
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return lexer
|
|
|
|
}
|
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
// NewLexer creates a new regex-based Lexer.
|
|
|
|
//
|
|
|
|
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
|
|
|
|
// that match input, optionally modify lexer state, and output tokens.
|
|
|
|
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
|
2021-02-07 19:16:49 -06:00
|
|
|
if config == nil {
|
|
|
|
config = &Config{}
|
|
|
|
}
|
2021-09-27 14:27:46 +10:00
|
|
|
for _, glob := range append(config.Filenames, config.AliasFilenames...) {
|
|
|
|
_, err := filepath.Match(glob, "")
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
|
|
|
|
}
|
|
|
|
}
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
r := &RegexLexer{
|
|
|
|
config: config,
|
|
|
|
fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
// One-off code to generate XML lexers in the Chroma source tree.
|
|
|
|
// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
|
|
|
|
// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
|
|
|
|
// data, err := Marshal(r)
|
|
|
|
// if err != nil {
|
|
|
|
// if errors.Is(err, ErrNotSerialisable) {
|
|
|
|
// fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
|
|
|
|
// return r, nil
|
|
|
|
// }
|
|
|
|
// return nil, err
|
|
|
|
// }
|
|
|
|
// _, file, _, ok := runtime.Caller(2)
|
|
|
|
// if !ok {
|
|
|
|
// panic("??")
|
|
|
|
// }
|
|
|
|
// fmt.Println(file)
|
|
|
|
// if strings.Contains(file, "/lexers/") {
|
|
|
|
// dir := filepath.Join(filepath.Dir(file), "embedded")
|
|
|
|
// err = os.MkdirAll(dir, 0700)
|
|
|
|
// if err != nil {
|
|
|
|
// return nil, err
|
|
|
|
// }
|
|
|
|
// filename := filepath.Join(dir, name) + ".xml"
|
|
|
|
// fmt.Println(filename)
|
|
|
|
// err = ioutil.WriteFile(filename, data, 0600)
|
|
|
|
// if err != nil {
|
|
|
|
// return nil, err
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
return r, nil
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// Trace enables debug tracing.
|
2017-09-21 17:52:28 +10:00
|
|
|
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
|
|
|
|
r.trace = trace
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2017-09-20 20:15:35 +10:00
|
|
|
// A CompiledRule is a Rule with a pre-compiled regex.
|
|
|
|
//
|
|
|
|
// Note that regular expressions are lazily compiled on first use of the lexer.
|
|
|
|
type CompiledRule struct {
|
|
|
|
Rule
|
|
|
|
Regexp *regexp2.Regexp
|
|
|
|
flags string
|
|
|
|
}
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// CompiledRules is a map of rule name to sequence of compiled rules in that rule.
|
2017-09-21 19:59:10 +10:00
|
|
|
type CompiledRules map[string][]*CompiledRule
|
2017-09-20 20:15:35 +10:00
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// LexerState contains the state for a single lex.
|
2017-09-20 20:15:35 +10:00
|
|
|
type LexerState struct {
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
Lexer *RegexLexer
|
|
|
|
Registry *LexerRegistry
|
|
|
|
Text []rune
|
|
|
|
Pos int
|
|
|
|
Rules CompiledRules
|
|
|
|
Stack []string
|
|
|
|
State string
|
|
|
|
Rule int
|
2017-09-20 20:15:35 +10:00
|
|
|
// Group matches.
|
|
|
|
Groups []string
|
2021-05-06 11:43:54 +04:30
|
|
|
// Named Group matches.
|
|
|
|
NamedGroups map[string]string
|
2017-09-20 20:15:35 +10:00
|
|
|
// Custum context for mutators.
|
|
|
|
MutatorContext map[interface{}]interface{}
|
2017-09-23 21:55:56 +10:00
|
|
|
iteratorStack []Iterator
|
2019-04-22 18:22:58 +10:00
|
|
|
options *TokeniseOptions
|
2021-02-06 20:13:50 +11:00
|
|
|
newlineAdded bool
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// Set mutator context.
|
2017-09-20 20:15:35 +10:00
|
|
|
func (l *LexerState) Set(key interface{}, value interface{}) {
|
|
|
|
l.MutatorContext[key] = value
|
|
|
|
}
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// Get mutator context.
|
2017-09-20 20:15:35 +10:00
|
|
|
func (l *LexerState) Get(key interface{}) interface{} {
|
|
|
|
return l.MutatorContext[key]
|
|
|
|
}
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// Iterator returns the next Token from the lexer.
|
2019-10-15 21:01:41 +11:00
|
|
|
func (l *LexerState) Iterator() Token { // nolint: gocognit
|
2021-02-06 20:13:50 +11:00
|
|
|
end := len(l.Text)
|
|
|
|
if l.newlineAdded {
|
|
|
|
end--
|
|
|
|
}
|
|
|
|
for l.Pos < end && len(l.Stack) > 0 {
|
2017-09-23 21:55:56 +10:00
|
|
|
// Exhaust the iterator stack, if any.
|
|
|
|
for len(l.iteratorStack) > 0 {
|
|
|
|
n := len(l.iteratorStack) - 1
|
|
|
|
t := l.iteratorStack[n]()
|
2018-11-03 16:22:51 -07:00
|
|
|
if t == EOF {
|
2017-09-23 21:55:56 +10:00
|
|
|
l.iteratorStack = l.iteratorStack[:n]
|
2017-09-20 22:19:36 +10:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
return t
|
|
|
|
}
|
2017-09-21 17:52:28 +10:00
|
|
|
|
2017-09-23 21:55:56 +10:00
|
|
|
l.State = l.Stack[len(l.Stack)-1]
|
|
|
|
if l.Lexer.trace {
|
|
|
|
fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
|
|
|
|
}
|
2018-03-18 21:57:34 +11:00
|
|
|
selectedRule, ok := l.Rules[l.State]
|
|
|
|
if !ok {
|
|
|
|
panic("unknown state " + l.State)
|
|
|
|
}
|
2021-05-06 11:43:54 +04:30
|
|
|
ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
|
2017-09-23 21:55:56 +10:00
|
|
|
// No match.
|
|
|
|
if groups == nil {
|
2019-04-22 18:22:58 +10:00
|
|
|
// From Pygments :\
|
|
|
|
//
|
|
|
|
// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
|
|
|
|
// emptied and the lexer continues scanning in the 'root' state. This can help producing
|
|
|
|
// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
|
|
|
|
// closed.
|
|
|
|
if l.Text[l.Pos] == '\n' && l.State != l.options.State {
|
|
|
|
l.Stack = []string{l.options.State}
|
|
|
|
continue
|
|
|
|
}
|
2017-09-23 21:55:56 +10:00
|
|
|
l.Pos++
|
2018-11-03 16:22:51 -07:00
|
|
|
return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
|
2017-09-23 21:55:56 +10:00
|
|
|
}
|
|
|
|
l.Rule = ruleIndex
|
|
|
|
l.Groups = groups
|
2021-05-06 11:43:54 +04:30
|
|
|
l.NamedGroups = namedGroups
|
2017-09-23 21:55:56 +10:00
|
|
|
l.Pos += utf8.RuneCountInString(groups[0])
|
|
|
|
if rule.Mutator != nil {
|
|
|
|
if err := rule.Mutator.Mutate(l); err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if rule.Type != nil {
|
2021-05-06 14:37:30 +04:30
|
|
|
l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
|
2017-09-21 17:52:28 +10:00
|
|
|
}
|
2017-09-20 22:19:36 +10:00
|
|
|
}
|
2017-09-23 21:55:56 +10:00
|
|
|
// Exhaust the IteratorStack, if any.
|
|
|
|
// Duplicate code, but eh.
|
|
|
|
for len(l.iteratorStack) > 0 {
|
|
|
|
n := len(l.iteratorStack) - 1
|
|
|
|
t := l.iteratorStack[n]()
|
2018-11-03 16:22:51 -07:00
|
|
|
if t == EOF {
|
2017-09-23 21:55:56 +10:00
|
|
|
l.iteratorStack = l.iteratorStack[:n]
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
return t
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we get to here and we still have text, return it as an error.
|
|
|
|
if l.Pos != len(l.Text) && len(l.Stack) == 0 {
|
|
|
|
value := string(l.Text[l.Pos:])
|
|
|
|
l.Pos = len(l.Text)
|
2018-11-03 16:22:51 -07:00
|
|
|
return Token{Type: Error, Value: value}
|
2017-09-23 21:55:56 +10:00
|
|
|
}
|
2018-11-03 16:22:51 -07:00
|
|
|
return EOF
|
2017-09-20 22:19:36 +10:00
|
|
|
}
|
|
|
|
|
2018-12-31 22:44:27 +11:00
|
|
|
// RegexLexer is the default lexer implementation used in Chroma.
|
2017-09-20 20:15:35 +10:00
|
|
|
type RegexLexer struct {
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
|
2017-09-20 20:15:35 +10:00
|
|
|
config *Config
|
|
|
|
analyser func(text string) float32
|
2017-09-21 17:52:28 +10:00
|
|
|
trace bool
|
2017-09-20 20:15:35 +10:00
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
mu sync.Mutex
|
|
|
|
compiled bool
|
|
|
|
rawRules Rules
|
|
|
|
rules map[string][]*CompiledRule
|
|
|
|
fetchRulesFunc func() (Rules, error)
|
|
|
|
compileOnce sync.Once
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *RegexLexer) String() string {
|
|
|
|
return r.config.Name
|
|
|
|
}
|
|
|
|
|
|
|
|
// Rules in the Lexer.
|
|
|
|
func (r *RegexLexer) Rules() (Rules, error) {
|
|
|
|
if err := r.needRules(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return r.rawRules, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// SetRegistry the lexer will use to lookup other lexers if necessary.
|
|
|
|
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
|
|
|
|
r.registry = registry
|
|
|
|
return r
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
// SetAnalyser sets the analyser function used to perform content inspection.
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
|
2017-09-20 20:15:35 +10:00
|
|
|
r.analyser = analyser
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2023-08-22 22:51:13 -03:00
|
|
|
// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
|
|
|
|
func (r *RegexLexer) AnalyseText(text string) float32 {
|
2017-09-20 20:15:35 +10:00
|
|
|
if r.analyser != nil {
|
|
|
|
return r.analyser(text)
|
|
|
|
}
|
2023-08-22 22:51:13 -03:00
|
|
|
return 0
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
// SetConfig replaces the Config for this Lexer.
|
|
|
|
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
|
|
|
|
r.config = config
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2023-08-22 22:51:13 -03:00
|
|
|
// Config returns the Config for this Lexer.
|
|
|
|
func (r *RegexLexer) Config() *Config {
|
2017-09-20 20:15:35 +10:00
|
|
|
return r.config
|
|
|
|
}
|
|
|
|
|
|
|
|
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
|
|
|
|
func (r *RegexLexer) maybeCompile() (err error) {
|
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
|
|
|
if r.compiled {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
for state, rules := range r.rules {
|
|
|
|
for i, rule := range rules {
|
|
|
|
if rule.Regexp == nil {
|
2019-06-09 21:43:16 +10:00
|
|
|
pattern := "(?:" + rule.Pattern + ")"
|
|
|
|
if rule.flags != "" {
|
|
|
|
pattern = "(?" + rule.flags + ")" + pattern
|
|
|
|
}
|
|
|
|
pattern = `\G` + pattern
|
2023-09-09 11:46:02 +10:00
|
|
|
rule.Regexp, err = regexp2.Compile(pattern, 0)
|
2017-09-20 20:15:35 +10:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
|
|
|
|
}
|
2020-07-08 20:22:02 +10:00
|
|
|
rule.Regexp.MatchTimeout = time.Millisecond * 250
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
2017-09-22 22:28:51 +10:00
|
|
|
}
|
|
|
|
}
|
2017-09-22 23:14:32 +10:00
|
|
|
restart:
|
|
|
|
seen := map[LexerMutator]bool{}
|
2017-09-22 22:28:51 +10:00
|
|
|
for state := range r.rules {
|
|
|
|
for i := 0; i < len(r.rules[state]); i++ {
|
|
|
|
rule := r.rules[state][i]
|
2017-09-21 19:59:10 +10:00
|
|
|
if compile, ok := rule.Mutator.(LexerMutator); ok {
|
2017-09-22 23:14:32 +10:00
|
|
|
if seen[compile] {
|
|
|
|
return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
|
|
|
|
}
|
|
|
|
seen[compile] = true
|
2017-09-22 22:28:51 +10:00
|
|
|
if err := compile.MutateLexer(r.rules, state, i); err != nil {
|
2017-09-21 19:59:10 +10:00
|
|
|
return err
|
|
|
|
}
|
2017-09-22 22:28:51 +10:00
|
|
|
// Process the rules again in case the mutator added/removed rules.
|
2017-09-22 23:14:32 +10:00
|
|
|
//
|
|
|
|
// This sounds bad, but shouldn't be significant in practice.
|
|
|
|
goto restart
|
2017-09-21 19:59:10 +10:00
|
|
|
}
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
}
|
|
|
|
r.compiled = true
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
func (r *RegexLexer) fetchRules() error {
|
|
|
|
rules, err := r.fetchRulesFunc()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
|
|
|
|
}
|
2021-02-07 19:16:49 -06:00
|
|
|
if _, ok := rules["root"]; !ok {
|
|
|
|
return fmt.Errorf("no \"root\" state")
|
|
|
|
}
|
|
|
|
compiledRules := map[string][]*CompiledRule{}
|
|
|
|
for state, rules := range rules {
|
|
|
|
compiledRules[state] = nil
|
|
|
|
for _, rule := range rules {
|
|
|
|
flags := ""
|
|
|
|
if !r.config.NotMultiline {
|
|
|
|
flags += "m"
|
|
|
|
}
|
|
|
|
if r.config.CaseInsensitive {
|
|
|
|
flags += "i"
|
|
|
|
}
|
|
|
|
if r.config.DotAll {
|
|
|
|
flags += "s"
|
|
|
|
}
|
|
|
|
compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
r.rawRules = rules
|
2021-02-07 19:16:49 -06:00
|
|
|
r.rules = compiledRules
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
func (r *RegexLexer) needRules() error {
|
2021-02-07 19:16:49 -06:00
|
|
|
var err error
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
if r.fetchRulesFunc != nil {
|
2021-02-07 19:16:49 -06:00
|
|
|
r.compileOnce.Do(func() {
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
err = r.fetchRules()
|
2021-02-07 19:16:49 -06:00
|
|
|
})
|
|
|
|
}
|
2017-09-20 20:15:35 +10:00
|
|
|
if err := r.maybeCompile(); err != nil {
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2023-08-22 22:51:13 -03:00
|
|
|
// Tokenise text using lexer, returning an iterator.
|
|
|
|
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
err := r.needRules()
|
|
|
|
if err != nil {
|
2017-09-20 22:19:36 +10:00
|
|
|
return nil, err
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
if options == nil {
|
|
|
|
options = defaultOptions
|
|
|
|
}
|
2020-03-04 16:56:47 +09:00
|
|
|
if options.EnsureLF {
|
|
|
|
text = ensureLF(text)
|
|
|
|
}
|
2021-02-06 20:13:50 +11:00
|
|
|
newlineAdded := false
|
2018-03-03 10:16:21 +11:00
|
|
|
if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
|
2017-09-29 21:59:52 +10:00
|
|
|
text += "\n"
|
2021-02-06 20:13:50 +11:00
|
|
|
newlineAdded = true
|
2017-09-29 21:59:52 +10:00
|
|
|
}
|
2017-09-20 20:15:35 +10:00
|
|
|
state := &LexerState{
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
Registry: r.registry,
|
2021-02-06 20:13:50 +11:00
|
|
|
newlineAdded: newlineAdded,
|
2019-04-22 18:22:58 +10:00
|
|
|
options: options,
|
2017-09-20 22:30:25 +10:00
|
|
|
Lexer: r,
|
2017-09-20 20:15:35 +10:00
|
|
|
Text: []rune(text),
|
|
|
|
Stack: []string{options.State},
|
|
|
|
Rules: r.rules,
|
|
|
|
MutatorContext: map[interface{}]interface{}{},
|
|
|
|
}
|
2017-09-23 21:55:56 +10:00
|
|
|
return state.Iterator, nil
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
// MustRules is like Rules() but will panic on error.
|
|
|
|
func (r *RegexLexer) MustRules() Rules {
|
|
|
|
rules, err := r.Rules()
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return rules
|
|
|
|
}
|
|
|
|
|
2021-05-06 11:43:54 +04:30
|
|
|
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
|
2017-09-20 20:15:35 +10:00
|
|
|
for i, rule := range rules {
|
2019-06-09 21:43:16 +10:00
|
|
|
match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
|
|
|
|
if match != nil && err == nil && match.Index == pos {
|
2017-09-20 20:15:35 +10:00
|
|
|
groups := []string{}
|
2021-05-08 11:45:10 +04:30
|
|
|
namedGroups := make(map[string]string)
|
2017-09-20 20:15:35 +10:00
|
|
|
for _, g := range match.Groups() {
|
2021-05-08 11:45:10 +04:30
|
|
|
namedGroups[g.Name] = g.String()
|
2017-09-20 20:15:35 +10:00
|
|
|
groups = append(groups, g.String())
|
|
|
|
}
|
2021-05-06 11:43:54 +04:30
|
|
|
return i, rule, groups, namedGroups
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
|
|
|
}
|
2021-05-06 11:43:54 +04:30
|
|
|
return 0, &CompiledRule{}, nil, nil
|
2017-09-20 20:15:35 +10:00
|
|
|
}
|
2020-03-04 16:56:47 +09:00
|
|
|
|
|
|
|
// ensureLF normalises line endings: every "\r\n" pair and every lone "\r"
// becomes a single "\n". It gives the same result as chained
// strings.ReplaceAll calls but works in a single pass over the bytes.
func ensureLF(text string) string {
	out := make([]byte, 0, len(text))
	for i := 0; i < len(text); i++ {
		b := text[i]
		switch {
		case b != '\r':
			out = append(out, b)
		case i+1 < len(text) && text[i+1] == '\n':
			// CR of a CRLF pair: drop it, the following LF is copied next.
		default:
			// Lone CR becomes LF.
			out = append(out, '\n')
		}
	}
	return string(out)
}
|