1
0
mirror of https://github.com/alecthomas/chroma.git synced 2025-01-28 03:29:41 +02:00
chroma/delegate.go
Alec Thomas cc2dd5b8ad Version 2 of Chroma
This cleans up the API in general, removing a bunch of deprecated stuff,
cleaning up circular imports, etc.

But the biggest change is switching to an optional XML format for the
regex lexer.

Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.

Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (eg. Go template lexer XML compresses from
3239 bytes to 718).

Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.

Why not earlier? Prior to the existence of fs.FS this was not a viable
option.

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):       5.3 ms ±   0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):     4.2 ms …   6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):      50.6 ms ±   0.5 ms    [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):    49.2 ms …  51.5 ms    51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):       6.9 ms ±   1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):     5.7 ms …  19.9 ms    196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

A slight increase in init time, but I think this is okay given the
increase in flexibility.

And binary size difference:

    $ du -h lexers.test*
    $ du -sh chroma*
    8.8M	chroma.master
    7.8M	chroma.xml
    7.8M	chroma.xml-pre-opt

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):       5.3 ms ±   0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):     4.2 ms …   6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):      50.6 ms ±   0.5 ms    [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):    49.2 ms …  51.5 ms    51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):       6.9 ms ±   1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):     5.7 ms …  19.9 ms    196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

Incompatible changes:

- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
2022-01-27 15:22:00 +11:00

153 lines
3.5 KiB
Go

package chroma
import (
"bytes"
)
// delegatingLexer pairs the two lexers used to tokenise one language embedded in another.
type delegatingLexer struct {
	// root lexes the text that the language lexer emitted as Other (or the whole input when the
	// language lexer recognised nothing); language is always run first over the whole input.
	root, language Lexer
}
// DelegatingLexer combines two lexers to handle the common case of a language embedded inside another, such as PHP
// inside HTML or PHP inside plain text.
//
// It takes two lexers as arguments: a root lexer and a language lexer. The whole input is scanned with the language
// lexer first, which must emit "Other" for anything it does not recognise. The text of those "Other" tokens is then
// lexed with the root lexer, and finally the two sets of tokens are merged.
//
// The lexers from the template lexer package use this base lexer.
func DelegatingLexer(root Lexer, language Lexer) Lexer {
	d := new(delegatingLexer)
	d.root = root
	d.language = language
	return d
}
// AnalyseText returns whatever the root lexer's AnalyseText reports for text; the language lexer is not consulted.
func (d *delegatingLexer) AnalyseText(text string) float32 {
	score := d.root.AnalyseText(text)
	return score
}
// SetAnalyser installs analyser on the root lexer only and returns the delegating lexer itself.
func (d *delegatingLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	// The root lexer's own return value is deliberately dropped so callers keep the wrapper.
	_ = d.root.SetAnalyser(analyser)
	return d
}
// SetRegistry passes r to both wrapped lexers (root first, then language) and returns the delegating lexer itself.
func (d *delegatingLexer) SetRegistry(r *LexerRegistry) Lexer {
	for _, wrapped := range []Lexer{d.root, d.language} {
		wrapped.SetRegistry(r)
	}
	return d
}
// Config returns the configuration of the language lexer; the root lexer's configuration is not exposed.
func (d *delegatingLexer) Config() *Config {
	cfg := d.language.Config()
	return cfg
}
// An insertion is a run of consecutive language tokens together with the range of the original text, measured in
// bytes, at which those tokens should be inserted.
type insertion struct {
	// start is the offset into the original text at which the run begins.
	start int
	// end is the offset at which the run stops. Tokenise assigns it only once an Other token follows the run, so it
	// stays zero for a run that reaches the end of the input.
	end int
	// tokens holds the language tokens of the run, in order.
	tokens []Token
}
// Tokenise lexes text in three passes and returns the merged token stream.
//
// The language lexer runs first over the whole of text. Every run of consecutive non-Other tokens is recorded as an
// insertion at its byte offset, while the values of the Other tokens are concatenated. If the language lexer produced
// no such run, text is handed to the root lexer unchanged. Otherwise the concatenated text is lexed with the root
// lexer and the insertions are spliced back in at their offsets, splitting a root token wherever an insertion falls
// inside it.
//
// An error from either underlying lexer is returned unchanged together with a nil Iterator.
func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint: gocognit
	langTokens, err := Tokenise(Coalesce(d.language), options, text)
	if err != nil {
		return nil, err
	}

	// Pass 1: compute insertions and gather the text of the "Other" tokens.
	var (
		rootText   bytes.Buffer
		insertions []*insertion
		current    *insertion // Run of language tokens currently being collected.
		prev       Token      // Previously visited token; equals EOF before the first one.
	)
	pos := 0 // Byte offset of tok within text.
	for _, tok := range langTokens {
		if tok.Type != Other {
			// A language token opens a new run when it is the first token or follows an Other token.
			if prev == EOF || prev.Type == Other {
				current = &insertion{start: pos}
				insertions = append(insertions, current)
			}
			current.tokens = append(current.tokens, tok)
		} else {
			// An Other token directly after a language token closes the open run.
			if current != nil && prev != EOF && prev.Type != Other {
				current.end = pos
			}
			rootText.WriteString(tok.Value)
		}
		prev = tok
		pos += len(tok.Value)
	}

	// Nothing was recognised by the language lexer, so the root lexer handles the whole input.
	if len(insertions) == 0 {
		return d.root.Tokenise(options, text)
	}

	// Pass 2: lex the gathered "Other" text with the root lexer.
	rootTokens, err := Tokenise(Coalesce(d.root), options, rootText.String())
	if err != nil {
		return nil, err
	}

	// Pass 3: interleave the two sets of tokens.
	var merged []Token
	cursor := 0 // Offset into text.

	rootIdx := 0
	nextRoot := func() Token {
		if rootIdx < len(rootTokens) {
			tok := rootTokens[rootIdx]
			rootIdx++
			return tok
		}
		return EOF
	}

	insIdx := 0
	nextInsertion := func() *insertion {
		if insIdx < len(insertions) {
			ins := insertions[insIdx]
			insIdx++
			return ins
		}
		return nil
	}

	tok, ins := nextRoot(), nextInsertion()
	for tok != EOF || ins != nil {
		// The pending root token ends at or before the next insertion (or no insertion is left): emit it whole.
		if tok != EOF && (ins == nil || ins.start >= cursor+len(tok.Value)) {
			merged = append(merged, tok)
			cursor += len(tok.Value)
			tok = nextRoot()
			continue
		}

		// The insertion starts inside (or at the start of) the pending root token, or the root tokens are
		// exhausted: emit the part of the root token before the insertion, then the insertion itself.
		head, rest := splitToken(tok, ins.start-cursor)
		if head != EOF {
			merged = append(merged, head)
			cursor += len(head.Value)
		}
		merged = append(merged, ins.tokens...)
		// NOTE(review): end is never assigned for a run that reaches the end of the input, so cursor is off after
		// splicing that run; this looks harmless because such a run is the last thing in text - confirm.
		cursor += ins.end - ins.start

		// Continue with the remainder of the split token, or fetch a fresh one if nothing remains.
		tok = rest
		if tok == EOF {
			tok = nextRoot()
		}
		ins = nextInsertion()
	}
	return Literator(merged...), nil
}
// splitToken cuts t into the part before byte offset and the part from offset onwards.
//
// EOF stands in for a missing half: both halves are EOF when t is EOF, the left half is EOF when offset is 0, and
// the right half is EOF when offset equals len(t.Value). Otherwise both halves are clones of t with Value narrowed
// to the respective slice.
func splitToken(t Token, offset int) (l Token, r Token) {
	switch {
	case t == EOF:
		return EOF, EOF
	case offset == 0:
		return EOF, t
	case offset == len(t.Value):
		return t, EOF
	}
	head, tail := t.Clone(), t.Clone()
	head.Value = head.Value[:offset]
	tail.Value = tail.Value[offset:]
	return head, tail
}