
Version 2 of Chroma

This cleans up the API in general: removing deprecated
functionality, untangling circular imports, and so on.

But the biggest change is switching to an optional XML format for the
regex lexer.

Having lexers defined only in Go is not ideal for a couple of reasons.
Firstly, it impedes a significant portion of contributors who use Chroma
in Hugo, but don't know Go. Secondly, it bloats the binary size of any
project that imports Chroma.

Why XML? YAML is an abomination and JSON is not human editable. XML
also compresses very well (e.g. the Go template lexer XML compresses
from 3239 bytes to 718).
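
For a sense of the format, a minimal lexer definition might look
roughly like this (an illustrative sketch; the exact element and
attribute names are an assumption, not taken from this commit):

    <lexer>
      <config>
        <name>Example</name>
        <alias>example</alias>
        <filename>*.ex</filename>
      </config>
      <rules>
        <state name="root">
          <rule pattern="\b(?:if|else|for)\b">
            <token type="Keyword"/>
          </rule>
          <rule pattern="\s+">
            <token type="Text"/>
          </rule>
        </state>
      </rules>
    </lexer>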

Why a new syntax format? All major existing formats rely on the
Oniguruma regex engine, which is extremely complex and for which there
is no Go port.

Why not earlier? Prior to the existence of fs.FS this was not a viable
option.
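
To illustrate why fs.FS matters here, a minimal sketch of the
pattern it enables (the directory layout and file names are
assumptions, not from this commit):

    package main

    import (
        "embed"
        "fmt"
        "io/fs"
    )

    // Embed the XML lexer definitions into the binary; fs.FS lets
    // the loader read them like an ordinary filesystem at runtime,
    // deferring parsing until a lexer is actually requested.
    //go:embed embedded/*.xml
    var embedded embed.FS

    func main() {
        matches, err := fs.Glob(embedded, "embedded/*.xml")
        if err != nil {
            panic(err)
        }
        for _, path := range matches {
            data, err := fs.ReadFile(embedded, path)
            if err != nil {
                panic(err)
            }
            fmt.Printf("%s: %d bytes\n", path, len(data))
        }
    }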

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):       5.3 ms ±   0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):     4.2 ms …   6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):      50.6 ms ±   0.5 ms    [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):    49.2 ms …  51.5 ms    51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):       6.9 ms ±   1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):     5.7 ms …  19.9 ms    196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

A slight increase in init time, but I think this is okay given the
increase in flexibility.

And the binary size difference:

    $ du -sh chroma*
    8.8M	chroma.master
    7.8M	chroma.xml
    7.8M	chroma.xml-pre-opt

Incompatible changes:

- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
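
For example, migrating a lexer definition to the new API might look
like this (a sketch; the rules, names, and v2 import path here are
assumptions, not part of this change list):

    package main

    import "github.com/alecthomas/chroma/v2"

    var lexer = chroma.MustNewLexer(
        &chroma.Config{Name: "example"},
        // v1 took a Rules value directly; v2 takes a function, so
        // building the rules (and compiling their regexes) is
        // deferred until the lexer is first used.
        func() chroma.Rules {
            return chroma.Rules{
                "root": {
                    {`\d+`, chroma.LiteralNumber, nil},
                    // Using now takes a lexer name rather than a
                    // Lexer value; it is resolved through the
                    // LexerRegistry when tokenising.
                    {`<%.*?%>`, chroma.Using("Go"), nil},
                },
            }
        },
    )
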
Author: Alec Thomas
Date:   2022-01-03 23:51:17 +11:00
Parent: d491f1b5c1
Commit: cc2dd5b8ad

490 changed files with 32816 additions and 14470 deletions

regexp.go (308 lines changed)

@@ -21,156 +21,6 @@ type Rule struct {
Mutator Mutator
}
// An Emitter takes group matches and returns tokens.
type Emitter interface {
// Emit tokens for the given regex groups.
Emit(groups []string, state *LexerState) Iterator
}
// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, state *LexerState) Iterator
// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, state *LexerState) Iterator {
return e(groups, state)
}
// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(emitters ...Emitter) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
iterators := make([]Iterator, 0, len(groups)-1)
if len(emitters) != len(groups)-1 {
iterators = append(iterators, Error.Emit(groups, state))
// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters))
} else {
for i, group := range groups[1:] {
if emitters[i] != nil {
iterators = append(iterators, emitters[i].Emit([]string{group}, state))
}
}
}
return Concaterator(iterators...)
})
}
// ByGroupNames emits a token for each named matching group in the rule's regex.
func ByGroupNames(emitters map[string]Emitter) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
iterators := make([]Iterator, 0, len(state.NamedGroups)-1)
if len(state.NamedGroups)-1 == 0 {
if emitter, ok := emitters[`0`]; ok {
iterators = append(iterators, emitter.Emit(groups, state))
} else {
iterators = append(iterators, Error.Emit(groups, state))
}
} else {
ruleRegex := state.Rules[state.State][state.Rule].Regexp
for i := 1; i < len(state.NamedGroups); i++ {
groupName := ruleRegex.GroupNameFromNumber(i)
group := state.NamedGroups[groupName]
if emitter, ok := emitters[groupName]; ok {
if emitter != nil {
iterators = append(iterators, emitter.Emit([]string{group}, state))
}
} else {
iterators = append(iterators, Error.Emit([]string{group}, state))
}
}
}
return Concaterator(iterators...)
})
}
// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example on a Markdown text block or SQL
// language block.
//
// The sublexer will be retrieved using sublexerGetFunc (typically
// internal.Get), using the captured value from the matched sublexerNameGroup.
//
// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup,
// then tokens for the matched codeGroup will be emitted using the retrieved
// lexer. Otherwise, if the sublexer is nil, then tokens will be emitted from
// the passed emitter.
//
// Example:
//
// var Markdown = internal.Register(MustNewLexer(
// &Config{
// Name: "markdown",
// Aliases: []string{"md", "mkd"},
// Filenames: []string{"*.md", "*.mkd", "*.markdown"},
// MimeTypes: []string{"text/x-markdown"},
// },
// Rules{
// "root": {
// {"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
// UsingByGroup(
// internal.Get,
// 2, 4,
// String, String, String, Text, String,
// ),
// nil,
// },
// },
// },
// ))
//
// See the lexers/m/markdown.go for the complete example.
//
// Note: panics if the number of emitters does not equal the number of matched
// groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
// bounds check
if len(emitters) != len(groups)-1 {
panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
}
// grab sublexer
sublexer := sublexerGetFunc(groups[sublexerNameGroup])
// build iterators
iterators := make([]Iterator, len(groups)-1)
for i, group := range groups[1:] {
if i == codeGroup-1 && sublexer != nil {
var err error
iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
if err != nil {
panic(err)
}
} else if emitters[i] != nil {
iterators[i] = emitters[i].Emit([]string{group}, state)
}
}
return Concaterator(iterators...)
})
}
// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer) Emitter {
return EmitterFunc(func(groups []string, _ *LexerState) Iterator {
it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
if err != nil {
panic(err)
}
return it
})
}
// UsingSelf is like Using, but uses the current Lexer.
func UsingSelf(stateName string) Emitter {
return EmitterFunc(func(groups []string, state *LexerState) Iterator {
it, err := state.Lexer.Tokenise(&TokeniseOptions{State: stateName, Nested: true}, groups[0])
if err != nil {
panic(err)
}
return it
})
}
// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
sort.Slice(words, func(i, j int) bool {
@@ -225,17 +75,20 @@ func (r Rules) Merge(rules Rules) Rules {
return out
}
// MustNewLazyLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLazyLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
lexer, err := NewLazyLexer(config, rulesFunc)
// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
lexer, err := NewLexer(config, rulesFunc)
if err != nil {
panic(err)
}
return lexer
}
// NewLazyLexer creates a new regex-based Lexer with deferred rules generation.
func NewLazyLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
if config == nil {
config = &Config{}
}
@@ -245,31 +98,40 @@ func NewLazyLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
return &RegexLexer{
config: config,
compilerFunc: rulesFunc,
}, nil
}
// MustNewLexer creates a new Lexer or panics.
//
// Deprecated: Use MustNewLazyLexer instead.
func MustNewLexer(config *Config, rules Rules) *RegexLexer { // nolint: forbidigo
lexer, err := NewLexer(config, rules) // nolint: forbidigo
if err != nil {
panic(err)
r := &RegexLexer{
config: config,
fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
}
return lexer
}
// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
//
// Deprecated: Use NewLazyLexer instead.
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) { // nolint: forbidigo
return NewLazyLexer(config, func() Rules { return rules })
// One-off code to generate XML lexers in the Chroma source tree.
// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
// data, err := Marshal(r)
// if err != nil {
// if errors.Is(err, ErrNotSerialisable) {
// fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
// return r, nil
// }
// return nil, err
// }
// _, file, _, ok := runtime.Caller(2)
// if !ok {
// panic("??")
// }
// fmt.Println(file)
// if strings.Contains(file, "/lexers/") {
// dir := filepath.Join(filepath.Dir(file), "embedded")
// err = os.MkdirAll(dir, 0700)
// if err != nil {
// return nil, err
// }
// filename := filepath.Join(dir, name) + ".xml"
// fmt.Println(filename)
// err = ioutil.WriteFile(filename, data, 0600)
// if err != nil {
// return nil, err
// }
// }
return r, nil
}
// Trace enables debug tracing.
@@ -292,13 +154,14 @@ type CompiledRules map[string][]*CompiledRule
// LexerState contains the state for a single lex.
type LexerState struct {
Lexer *RegexLexer
Text []rune
Pos int
Rules CompiledRules
Stack []string
State string
Rule int
Lexer *RegexLexer
Registry *LexerRegistry
Text []rune
Pos int
Rules CompiledRules
Stack []string
State string
Rule int
// Group matches.
Groups []string
// Named Group matches.
@@ -398,19 +261,39 @@ func (l *LexerState) Iterator() Token { // nolint: gocognit
// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
config *Config
analyser func(text string) float32
trace bool
mu sync.Mutex
compiled bool
rules map[string][]*CompiledRule
compilerFunc func() Rules
compileOnce sync.Once
mu sync.Mutex
compiled bool
rawRules Rules
rules map[string][]*CompiledRule
fetchRulesFunc func() (Rules, error)
compileOnce sync.Once
}
func (r *RegexLexer) String() string {
return r.config.Name
}
// Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
if err := r.needRules(); err != nil {
return nil, err
}
return r.rawRules, nil
}
// SetRegistry the lexer will use to lookup other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
r.registry = registry
return r
}
// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
r.analyser = analyser
return r
}
@@ -422,6 +305,12 @@ func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
return 0.0
}
// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
r.config = config
return r
}
func (r *RegexLexer) Config() *Config { // nolint
return r.config
}
@@ -473,8 +362,11 @@ restart:
return nil
}
func (r *RegexLexer) compileRules() error {
rules := r.compilerFunc()
func (r *RegexLexer) fetchRules() error {
rules, err := r.fetchRulesFunc()
if err != nil {
return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
}
if _, ok := rules["root"]; !ok {
return fmt.Errorf("no \"root\" state")
}
@@ -496,21 +388,27 @@
}
}
r.rawRules = rules
r.rules = compiledRules
return nil
}
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
func (r *RegexLexer) needRules() error {
var err error
if r.compilerFunc != nil {
if r.fetchRulesFunc != nil {
r.compileOnce.Do(func() {
err = r.compileRules()
err = r.fetchRules()
})
}
if err != nil {
return nil, err
}
if err := r.maybeCompile(); err != nil {
return err
}
return err
}
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
err := r.needRules()
if err != nil {
return nil, err
}
if options == nil {
@@ -525,6 +423,7 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
newlineAdded = true
}
state := &LexerState{
Registry: r.registry,
newlineAdded: newlineAdded,
options: options,
Lexer: r,
@@ -536,6 +435,15 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
return state.Iterator, nil
}
// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
rules, err := r.Rules()
if err != nil {
panic(err)
}
return rules
}
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
for i, rule := range rules {
match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)