
feat: improve tracing

`--trace` now outputs a stream of JSON objects with tracing information, e.g.

```
{"lexer":"markdown","state":"root","rule":15,"pos":0,"elapsedMs":0.022875}
{"lexer":"markdown","state":"root","rule":15,"pos":1,"elapsedMs":0.002667}
{"lexer":"markdown","state":"root","rule":15,"pos":2,"elapsedMs":0.001833}
{"lexer":"markdown","state":"root","rule":15,"pos":3,"elapsedMs":0.002166}
{"lexer":"markdown","state":"root","rule":15,"pos":4,"elapsedMs":0.002125}
```

This should generally be much more amenable to analysis, e.g. convenient
filtering using jq to help track down hotspots:

```
chroma --trace docs.md 2>&1 > /dev/null | jq 'select(. | .elapsedMs > 0.1)' | less
```
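Because each trace record is a self-contained JSON object, aggregation is
straightforward too. As a hypothetical example (not part of this change),
totalling elapsed time per lexer state with jq's slurp mode:

```
chroma --trace docs.md 2>&1 > /dev/null \
  | jq -s 'group_by(.state) | map({state: .[0].state, totalMs: (map(.elapsedMs) | add)}) | sort_by(-.totalMs)'
```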
Author: Alec Thomas
Date: 2025-08-04 13:51:19 +10:00
Parent: 1f48e65abc
Commit: 303b65df3f
4 changed files with 61 additions and 5 deletions


```
@@ -346,8 +346,8 @@ func listAll() {
 }
 
 func lex(ctx *kong.Context, lexer chroma.Lexer, contents string) chroma.Iterator {
-	if rel, ok := lexer.(*chroma.RegexLexer); ok {
-		rel.Trace(cli.Trace)
+	if rel, ok := lexer.(chroma.TracingLexer); ok {
+		rel.SetTracing(cli.Trace)
 	}
 	lexer = chroma.Coalesce(lexer)
 	it, err := lexer.Tokenise(nil, contents)
```
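For library users rather than the CLI, the same pattern applies. A minimal
sketch of enabling tracing from library code (assuming the v2 module path and
the standard lexers registry; not part of this diff):

```
package main

import (
	"github.com/alecthomas/chroma/v2"
	"github.com/alecthomas/chroma/v2/lexers"
)

func main() {
	lexer := lexers.Get("markdown")
	// Mirror the CLI change above: assert on the TracingLexer interface
	// instead of the concrete *chroma.RegexLexer type.
	if tl, ok := lexer.(chroma.TracingLexer); ok {
		tl.SetTracing(true) // JSON trace records are written to stderr
	}
	it, err := lexer.Tokenise(nil, "# Heading\n")
	if err != nil {
		panic(err)
	}
	for t := it(); t != chroma.EOF; t = it() {
		_ = t // consuming tokens drives the lexer, which emits trace records
	}
}
```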


```
@@ -24,6 +24,15 @@ func DelegatingLexer(root Lexer, language Lexer) Lexer {
 	}
 }
 
+func (d *delegatingLexer) SetTracing(enable bool) {
+	if l, ok := d.language.(TracingLexer); ok {
+		l.SetTracing(enable)
+	}
+	if l, ok := d.root.(TracingLexer); ok {
+		l.SetTracing(enable)
+	}
+}
+
 func (d *delegatingLexer) AnalyseText(text string) float32 {
 	return d.root.AnalyseText(text)
 }
```
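Since SetTracing is forwarded to both the embedded language lexer and the root
lexer when they support it, tracing a delegating lexer (e.g. a template
language embedded in HTML) covers both halves of the tokenisation.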


```
@@ -130,6 +130,23 @@ type Lexer interface {
 	AnalyseText(text string) float32
 }
 
+// Trace is the trace of a tokenisation process.
+type Trace struct {
+	Lexer   string  `json:"lexer"`
+	State   string  `json:"state"`
+	Rule    int     `json:"rule"`
+	Pattern string  `json:"pattern"`
+	Pos     int     `json:"pos"`
+	Length  int     `json:"length"`
+	Elapsed float64 `json:"elapsedMs"` // Elapsed time spent matching for this rule.
+}
+
+// TracingLexer is a Lexer that can trace its tokenisation process.
+type TracingLexer interface {
+	Lexer
+	SetTracing(enable bool)
+}
+
 // Lexers is a slice of lexers sortable by name.
 type Lexers []Lexer
```


```
@@ -1,6 +1,7 @@
 package chroma
 
 import (
+	"encoding/json"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -135,11 +136,20 @@ func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
 }
 
 // Trace enables debug tracing.
+//
+// Deprecated: Use SetTracing instead.
 func (r *RegexLexer) Trace(trace bool) *RegexLexer {
 	r.trace = trace
 	return r
 }
 
+// SetTracing enables debug tracing.
+//
+// This complies with the [TracingLexer] interface.
+func (r *RegexLexer) SetTracing(trace bool) {
+	r.trace = trace
+}
+
 // A CompiledRule is a Rule with a pre-compiled regex.
 //
 // Note that regular expressions are lazily compiled on first use of the lexer.
@@ -185,6 +195,7 @@ func (l *LexerState) Get(key interface{}) interface{} {
 
 // Iterator returns the next Token from the lexer.
 func (l *LexerState) Iterator() Token { // nolint: gocognit
+	trace := json.NewEncoder(os.Stderr)
 	end := len(l.Text)
 	if l.newlineAdded {
 		end--
@@ -205,14 +216,33 @@
 		}
 		l.State = l.Stack[len(l.Stack)-1]
-		if l.Lexer.trace {
-			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
-		}
 		selectedRule, ok := l.Rules[l.State]
 		if !ok {
 			panic("unknown state " + l.State)
 		}
 
+		var start time.Time
+		if l.Lexer.trace {
+			start = time.Now()
+		}
 		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
+		if l.Lexer.trace {
+			var length int
+			if groups != nil {
+				length = len(groups[0])
+			} else {
+				length = -1
+			}
+			_ = trace.Encode(Trace{ //nolint
+				Lexer:   l.Lexer.config.Name,
+				State:   l.State,
+				Rule:    ruleIndex,
+				Pattern: rule.Pattern,
+				Pos:     l.Pos,
+				Length:  length,
+				Elapsed: float64(time.Since(start)) / float64(time.Millisecond),
+			})
+			// fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q, elapsed=%s\n", l.State, l.Pos, string(l.Text[l.Pos:]), time.Since(start))
+		}
 		// No match.
 		if groups == nil {
 			// From Pygments :\
```
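A note on the Elapsed computation above: time.Duration is an integer
nanosecond count, so the float division by time.Millisecond yields fractional
milliseconds, which is exactly the elapsedMs shape in the example output. A
standalone check:

```
package main

import (
	"fmt"
	"time"
)

func main() {
	// Same conversion as the tracing code: nanoseconds divided by
	// time.Millisecond (1e6 ns) gives fractional milliseconds.
	d := 22875 * time.Nanosecond
	fmt.Println(float64(d) / float64(time.Millisecond)) // 0.022875
}
```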