Mirror of https://github.com/alecthomas/chroma.git (synced 2025-03-19 21:10:15 +02:00)
This cleans up the API in general, removing a bunch of deprecated stuff, cleaning up circular imports, etc. The biggest change, however, is switching to an optional XML format for the regex lexers.

Having lexers defined only in Go is not ideal for a couple of reasons. Firstly, it locks out a significant portion of contributors who use Chroma through Hugo but don't know Go. Secondly, it bloats the binary size of any project that imports Chroma.

Why XML? YAML is an abomination and JSON is not human-editable. XML also compresses very well (e.g. the Go template lexer XML compresses from 3239 bytes to 718).

Why a new syntax format? All major existing formats rely on the Oniguruma regex engine, which is extremely complex and has no Go port.

Why not earlier? Prior to the existence of fs.FS this was not a viable option.

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):     5.3 ms ± 0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):   4.2 ms … 6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):     50.6 ms ± 0.5 ms   [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):   49.2 ms … 51.5 ms  51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):     6.9 ms ± 1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):   5.7 ms … 19.9 ms   196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

A slight increase in init time, but I think this is okay given the increase in flexibility.

And the binary size difference:

    $ du -sh chroma*
    8.8M    chroma.master
    7.8M    chroma.xml
    7.8M    chroma.xml-pre-opt

Incompatible changes:

- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer (see the sketch after this list)
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
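To make the constructor change concrete, here is a minimal sketch of the new lazy-rules API. The lexer name, patterns, and token types are invented for the example, and the /v2 import path is an assumption; the func() Rules signature is the part taken from the change list above.

    package mylexer

    import "github.com/alecthomas/chroma/v2"

    // Rules are now supplied as a func() Rules, so they are built on first
    // use rather than eagerly at init time.
    var MyLang = chroma.MustNewLexer(
    	&chroma.Config{
    		Name:      "MyLang", // hypothetical language, for illustration only
    		Aliases:   []string{"mylang"},
    		Filenames: []string{"*.my"},
    	},
    	func() chroma.Rules {
    		return chroma.Rules{
    			"root": {
    				{`//.*`, chroma.CommentSingle, nil},
    				{`"[^"]*"`, chroma.String, nil},
    				{`\s+`, chroma.Whitespace, nil},
    				{`.`, chroma.Text, nil},
    			},
    		}
    	},
    )

In the same spirit, Using now takes the target lexer's name rather than a Lexer value, which, per the new Lexer.SetRegistry method below, lets cross-lexer references be resolved through a registry instead of being captured directly at definition time.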
143 lines · 3.8 KiB · Go
package chroma

import (
	"fmt"
	"strings"
)

var (
	defaultOptions = &TokeniseOptions{
		State:    "root",
		EnsureLF: true,
	}
)

// Config for a lexer.
type Config struct {
	// Name of the lexer.
	Name string `xml:"name,omitempty"`

	// Shortcuts for the lexer
	Aliases []string `xml:"alias,omitempty"`

	// File name globs
	Filenames []string `xml:"filename,omitempty"`

	// Secondary file name globs
	AliasFilenames []string `xml:"alias_filename,omitempty"`

	// MIME types
	MimeTypes []string `xml:"mime_type,omitempty"`

	// Regex matching is case-insensitive.
	CaseInsensitive bool `xml:"case_insensitive,omitempty"`

	// Regex matches all characters.
	DotAll bool `xml:"dot_all,omitempty"`

	// Regex does not match across lines ($ matches EOL).
	//
	// Defaults to multiline.
	NotMultiline bool `xml:"not_multiline,omitempty"`

	// Don't strip leading and trailing newlines from the input.
	// DontStripNL bool

	// Strip all leading and trailing whitespace from the input
	// StripAll bool

	// Make sure that the input ends with a newline. This
	// is required for some lexers that consume input linewise.
	EnsureNL bool `xml:"ensure_nl,omitempty"`

	// If given and greater than 0, expand tabs in the input.
	// TabSize int

	// Priority of lexer.
	//
	// If this is 0 it will be treated as a default of 1.
	Priority float32 `xml:"priority,omitempty"`
}
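
// The xml struct tags above mean a Config can be decoded straight from a
// lexer definition with encoding/xml. A minimal sketch (the element values
// are invented for the example; real lexer files carry more surrounding
// structure):
//
//	var cfg Config
//	err := xml.Unmarshal([]byte(`
//	<config>
//	  <name>MyLang</name>
//	  <alias>mylang</alias>
//	  <alias>my</alias>
//	  <filename>*.my</filename>
//	</config>`), &cfg)
//	// err == nil
//	// cfg.Name == "MyLang"
//	// cfg.Aliases == []string{"mylang", "my"}
//	// cfg.Filenames == []string{"*.my"}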

// Token output to formatter.
type Token struct {
	Type  TokenType `json:"type"`
	Value string    `json:"value"`
}

func (t *Token) String() string   { return t.Value }
func (t *Token) GoString() string { return fmt.Sprintf("&Token{%s, %q}", t.Type, t.Value) }

// Clone returns a clone of the Token.
func (t *Token) Clone() Token {
	return *t
}
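
// String and GoString make tokens friendly to the fmt verbs %s and %#v.
// For example:
//
//	tok := Token{Type: Keyword, Value: "func"}
//	fmt.Printf("%s\n", &tok)  // func
//	fmt.Printf("%#v\n", &tok) // &Token{Keyword, "func"}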

// EOF is returned by lexers at the end of input.
var EOF Token

// TokeniseOptions contains options for tokenisers.
type TokeniseOptions struct {
	// State to start tokenisation in. Defaults to "root".
	State string
	// Nested tokenisation.
	Nested bool

	// If true, all EOLs are converted into LF
	// by replacing CRLF and CR
	EnsureLF bool
}

// A Lexer for tokenising source code.
type Lexer interface {
	// Config describing the features of the Lexer.
	Config() *Config
	// Tokenise returns an Iterator over tokens in text.
	Tokenise(options *TokeniseOptions, text string) (Iterator, error)
	// SetRegistry sets the registry this Lexer is associated with.
	//
	// The registry should be used by the Lexer if it needs to look up other
	// lexers.
	SetRegistry(registry *LexerRegistry) Lexer
	// SetAnalyser sets a function the Lexer should use for scoring how
	// likely a fragment of text is to match this lexer, between 0.0 and 1.0.
	// A value of 1 indicates high confidence.
	//
	// Lexers may ignore this if they implement their own analysers.
	SetAnalyser(analyser func(text string) float32) Lexer
	// AnalyseText scores how likely a fragment of text is to match
	// this lexer, between 0.0 and 1.0. A value of 1 indicates high confidence.
	AnalyseText(text string) float32
}
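
// A typical tokenisation loop over this interface, as a sketch. It assumes
// the v2 import paths (github.com/alecthomas/chroma/v2 and its lexers
// subpackage), with lexers.Get and lexers.Fallback used for lookup:
//
//	lexer := lexers.Get("go") // by name or alias; nil if unknown
//	if lexer == nil {
//		lexer = lexers.Fallback
//	}
//	// For the regex lexers, a nil *TokeniseOptions falls back to
//	// defaultOptions (State: "root", EnsureLF: true).
//	it, err := lexer.Tokenise(nil, "package main")
//	if err != nil {
//		log.Fatal(err)
//	}
//	for tok := it(); tok != EOF; tok = it() {
//		fmt.Printf("%s %q\n", tok.Type, tok.Value)
//	}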

// Lexers is a slice of lexers sortable by name.
type Lexers []Lexer

func (l Lexers) Len() int      { return len(l) }
func (l Lexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l Lexers) Less(i, j int) bool {
	return strings.ToLower(l[i].Config().Name) < strings.ToLower(l[j].Config().Name)
}

// PrioritisedLexers is a slice of lexers sortable by priority.
type PrioritisedLexers []Lexer

func (l PrioritisedLexers) Len() int      { return len(l) }
func (l PrioritisedLexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l PrioritisedLexers) Less(i, j int) bool {
	ip := l[i].Config().Priority
	if ip == 0 {
		ip = 1
	}
	jp := l[j].Config().Priority
	if jp == 0 {
		jp = 1
	}
	return ip > jp
}
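
// Both orderings plug into the standard library's sort package. A sketch,
// with invented lexer variables:
//
//	all := Lexers{goLexer, pythonLexer, cLexer}
//	sort.Sort(all) // case-insensitive alphabetical: C, Go, Python
//
//	byPriority := PrioritisedLexers(all)
//	sort.Sort(byPriority) // descending Priority, zero counting as the default 1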

// Analyser determines how appropriate this lexer is for the given text.
type Analyser interface {
	AnalyseText(text string) float32
}
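
// Every Lexer is also an Analyser via its AnalyseText method, and a custom
// heuristic can be attached with SetAnalyser. A sketch (the shebang check is
// invented for the example):
//
//	lexer = lexer.SetAnalyser(func(text string) float32 {
//		if strings.HasPrefix(text, "#!/bin/bash") {
//			return 1.0 // certain match
//		}
//		return 0.0
//	})
//	// 1.0, assuming the lexer honours the override (lexers that implement
//	// their own analysers may ignore it).
//	score := lexer.AnalyseText("#!/bin/bash\necho hello")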