Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
package chroma
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
)
|
|
|
|
|
|
|
|
// An Emitter takes group matches and returns tokens.
|
|
|
|
type Emitter interface {
|
|
|
|
// Emit tokens for the given regex groups.
|
|
|
|
Emit(groups []string, state *LexerState) Iterator
|
|
|
|
}
|
|
|
|
|
|
|
|
// SerialisableEmitter is an Emitter that can be serialised and deserialised to/from JSON.
|
|
|
|
type SerialisableEmitter interface {
|
|
|
|
Emitter
|
|
|
|
EmitterKind() string
|
|
|
|
}
|
|
|
|
|
|
|
|
// EmitterFunc is a function that is an Emitter.
|
|
|
|
type EmitterFunc func(groups []string, state *LexerState) Iterator
|
|
|
|
|
|
|
|
// Emit tokens for groups.
|
|
|
|
func (e EmitterFunc) Emit(groups []string, state *LexerState) Iterator {
|
|
|
|
return e(groups, state)
|
|
|
|
}
|
|
|
|
|
|
|
|
type Emitters []Emitter
|
|
|
|
|
|
|
|
type byGroupsEmitter struct {
|
|
|
|
Emitters
|
|
|
|
}
|
|
|
|
|
|
|
|
// ByGroups emits a token for each matching group in the rule's regex.
|
|
|
|
func ByGroups(emitters ...Emitter) Emitter {
|
|
|
|
return &byGroupsEmitter{Emitters: emitters}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *byGroupsEmitter) EmitterKind() string { return "bygroups" }
|
|
|
|
|
|
|
|
func (b *byGroupsEmitter) Emit(groups []string, state *LexerState) Iterator {
|
|
|
|
iterators := make([]Iterator, 0, len(groups)-1)
|
|
|
|
if len(b.Emitters) != len(groups)-1 {
|
|
|
|
iterators = append(iterators, Error.Emit(groups, state))
|
|
|
|
// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters))
|
|
|
|
} else {
|
|
|
|
for i, group := range groups[1:] {
|
|
|
|
if b.Emitters[i] != nil {
|
|
|
|
iterators = append(iterators, b.Emitters[i].Emit([]string{group}, state))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Concaterator(iterators...)
|
|
|
|
}
|
|
|
|
|
|
|
|
// ByGroupNames emits a token for each named matching group in the rule's regex.
|
|
|
|
func ByGroupNames(emitters map[string]Emitter) Emitter {
|
|
|
|
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
|
|
|
|
iterators := make([]Iterator, 0, len(state.NamedGroups)-1)
|
|
|
|
if len(state.NamedGroups)-1 == 0 {
|
|
|
|
if emitter, ok := emitters[`0`]; ok {
|
|
|
|
iterators = append(iterators, emitter.Emit(groups, state))
|
|
|
|
} else {
|
|
|
|
iterators = append(iterators, Error.Emit(groups, state))
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ruleRegex := state.Rules[state.State][state.Rule].Regexp
|
|
|
|
for i := 1; i < len(state.NamedGroups); i++ {
|
|
|
|
groupName := ruleRegex.GroupNameFromNumber(i)
|
|
|
|
group := state.NamedGroups[groupName]
|
|
|
|
if emitter, ok := emitters[groupName]; ok {
|
|
|
|
if emitter != nil {
|
|
|
|
iterators = append(iterators, emitter.Emit([]string{group}, state))
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
iterators = append(iterators, Error.Emit([]string{group}, state))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Concaterator(iterators...)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// UsingByGroup emits tokens for the matched groups in the regex using a
|
2022-08-23 06:15:32 +07:00
|
|
|
// sublexer. Used when lexing code blocks where the name of a sublexer is
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
// contained within the block, for example on a Markdown text block or SQL
|
|
|
|
// language block.
|
|
|
|
//
|
2022-08-23 06:15:32 +07:00
|
|
|
// An attempt to load the sublexer will be made using the captured value from
|
|
|
|
// the text of the matched sublexerNameGroup. If a sublexer matching the
|
|
|
|
// sublexerNameGroup is available, then tokens for the matched codeGroup will
|
|
|
|
// be emitted using the sublexer. Otherwise, if no sublexer is available, then
|
|
|
|
// tokens will be emitted from the passed emitter.
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
//
|
|
|
|
// Example:
|
|
|
|
//
|
2022-08-23 06:15:32 +07:00
|
|
|
// var Markdown = internal.Register(MustNewLexer(
|
|
|
|
// &Config{
|
|
|
|
// Name: "markdown",
|
|
|
|
// Aliases: []string{"md", "mkd"},
|
|
|
|
// Filenames: []string{"*.md", "*.mkd", "*.markdown"},
|
|
|
|
// MimeTypes: []string{"text/x-markdown"},
|
|
|
|
// },
|
|
|
|
// Rules{
|
|
|
|
// "root": {
|
|
|
|
// {"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
|
|
|
|
// UsingByGroup(
|
|
|
|
// 2, 4,
|
|
|
|
// String, String, String, Text, String,
|
|
|
|
// ),
|
|
|
|
// nil,
|
|
|
|
// },
|
|
|
|
// },
|
|
|
|
// },
|
|
|
|
// ))
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
//
|
2022-08-23 06:15:32 +07:00
|
|
|
// See the lexers/markdown.go for the complete example.
|
Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.
But the biggest change is switching to an optional XML format for the
regex lexer.
Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.
Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).
Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.
Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
A slight increase in init time, but I think this is okay given the
increase in flexibility.
And binary size difference:
$ du -h lexers.test*
$ du -sh chroma* 951371ms
8.8M chroma.master
7.8M chroma.xml
7.8M chroma.xml-pre-opt
Benchmarks:
$ hyperfine --warmup 3 \
'./chroma.master --version' \
'./chroma.xml-pre-opt --version' \
'./chroma.xml --version'
Benchmark 1: ./chroma.master --version
Time (mean ± σ): 5.3 ms ± 0.5 ms [User: 3.6 ms, System: 1.4 ms]
Range (min … max): 4.2 ms … 6.6 ms 233 runs
Benchmark 2: ./chroma.xml-pre-opt --version
Time (mean ± σ): 50.6 ms ± 0.5 ms [User: 52.4 ms, System: 3.6 ms]
Range (min … max): 49.2 ms … 51.5 ms 51 runs
Benchmark 3: ./chroma.xml --version
Time (mean ± σ): 6.9 ms ± 1.1 ms [User: 5.1 ms, System: 1.5 ms]
Range (min … max): 5.7 ms … 19.9 ms 196 runs
Summary
'./chroma.master --version' ran
1.30 ± 0.23 times faster than './chroma.xml --version'
9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'
Incompatible changes:
- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-03 23:51:17 +11:00
|
|
|
//
|
|
|
|
// Note: panic's if the number of emitters does not equal the number of matched
|
|
|
|
// groups in the regex.
|
|
|
|
func UsingByGroup(sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
|
|
|
|
return &usingByGroup{
|
|
|
|
SublexerNameGroup: sublexerNameGroup,
|
|
|
|
CodeGroup: codeGroup,
|
|
|
|
Emitters: emitters,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
type usingByGroup struct {
|
|
|
|
SublexerNameGroup int `xml:"sublexer_name_group"`
|
|
|
|
CodeGroup int `xml:"code_group"`
|
|
|
|
Emitters Emitters `xml:"emitters"`
|
|
|
|
}
|
|
|
|
|
|
|
|
func (u *usingByGroup) EmitterKind() string { return "usingbygroup" }
|
|
|
|
func (u *usingByGroup) Emit(groups []string, state *LexerState) Iterator {
|
|
|
|
// bounds check
|
|
|
|
if len(u.Emitters) != len(groups)-1 {
|
|
|
|
panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
|
|
|
|
}
|
|
|
|
|
|
|
|
// grab sublexer
|
|
|
|
sublexer := state.Registry.Get(groups[u.SublexerNameGroup])
|
|
|
|
|
|
|
|
// build iterators
|
|
|
|
iterators := make([]Iterator, len(groups)-1)
|
|
|
|
for i, group := range groups[1:] {
|
|
|
|
if i == u.CodeGroup-1 && sublexer != nil {
|
|
|
|
var err error
|
|
|
|
iterators[i], err = sublexer.Tokenise(nil, groups[u.CodeGroup])
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
} else if u.Emitters[i] != nil {
|
|
|
|
iterators[i] = u.Emitters[i].Emit([]string{group}, state)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Concaterator(iterators...)
|
|
|
|
}
|
|
|
|
|
|
|
|
// UsingLexer returns an Emitter that uses a given Lexer for parsing and emitting.
|
|
|
|
//
|
|
|
|
// This Emitter is not serialisable.
|
|
|
|
func UsingLexer(lexer Lexer) Emitter {
|
|
|
|
return EmitterFunc(func(groups []string, _ *LexerState) Iterator {
|
|
|
|
it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return it
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
type usingEmitter struct {
|
|
|
|
Lexer string `xml:"lexer,attr"`
|
|
|
|
}
|
|
|
|
|
|
|
|
func (u *usingEmitter) EmitterKind() string { return "using" }
|
|
|
|
|
|
|
|
func (u *usingEmitter) Emit(groups []string, state *LexerState) Iterator {
|
|
|
|
if state.Registry == nil {
|
|
|
|
panic(fmt.Sprintf("no LexerRegistry available for Using(%q)", u.Lexer))
|
|
|
|
}
|
|
|
|
lexer := state.Registry.Get(u.Lexer)
|
|
|
|
if lexer == nil {
|
|
|
|
panic(fmt.Sprintf("no such lexer %q", u.Lexer))
|
|
|
|
}
|
|
|
|
it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return it
|
|
|
|
}
|
|
|
|
|
|
|
|
// Using returns an Emitter that uses a given Lexer reference for parsing and emitting.
|
|
|
|
//
|
|
|
|
// The referenced lexer must be stored in the same LexerRegistry.
|
|
|
|
func Using(lexer string) Emitter {
|
|
|
|
return &usingEmitter{Lexer: lexer}
|
|
|
|
}
|
|
|
|
|
|
|
|
type usingSelfEmitter struct {
|
|
|
|
State string `xml:"state,attr"`
|
|
|
|
}
|
|
|
|
|
|
|
|
func (u *usingSelfEmitter) EmitterKind() string { return "usingself" }
|
|
|
|
|
|
|
|
func (u *usingSelfEmitter) Emit(groups []string, state *LexerState) Iterator {
|
|
|
|
it, err := state.Lexer.Tokenise(&TokeniseOptions{State: u.State, Nested: true}, groups[0])
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return it
|
|
|
|
}
|
|
|
|
|
|
|
|
// UsingSelf is like Using, but uses the current Lexer.
|
|
|
|
func UsingSelf(stateName string) Emitter {
|
|
|
|
return &usingSelfEmitter{stateName}
|
|
|
|
}
|