
Switch to an Iterator interface.

This solves an issue where the writers returned by a Formatter were
often stateful. That was not obvious to API consumers, and the
resulting misuse failed in interesting ways.
Alec Thomas 2017-09-20 22:19:36 +10:00
parent 36ead7258a
commit cc0e4a59ab
20 changed files with 215 additions and 129 deletions


@@ -1,5 +1,7 @@
# Chroma - A general purpose syntax highlighter in pure Go [![](https://godoc.org/github.com/alecthomas/chroma?status.svg)](http://godoc.org/github.com/alecthomas/chroma) [![Build Status](https://travis-ci.org/alecthomas/chroma.png)](https://travis-ci.org/alecthomas/chroma) [![Gitter chat](https://badges.gitter.im/alecthomas.png)](https://gitter.im/alecthomas/Lobby)
> **NOTE:** As Chroma has just been released, its API is still in flux. That said, the high-level interface should not change significantly.
Chroma takes source code and other structured text and converts it into syntax
highlighted HTML, ANSI-coloured text, etc.
@@ -115,17 +117,17 @@ if formatter == nil {
}
```
Then obtain a formatting function from the formatter:
```go
writer, err := formatter.Format(w, style)
```
And finally, lex the source code and write the output:
Then obtain an iterator over the tokens:
```go
contents, err := ioutil.ReadAll(r)
err := lexer.Tokenise(nil, string(contents), writer)
iterator, err := lexer.Tokenise(nil, string(contents))
```
And finally, format the tokens from the iterator:
```go
err := formatter.Format(w, style, iterator)
```
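Putting the pieces together, a minimal end-to-end sketch (assuming the `lexers`, `formatters`, and `styles` subpackages, and the `terminal16m` formatter registered elsewhere in this commit):
```go
package main

import (
	"io/ioutil"
	"log"
	"os"

	"github.com/alecthomas/chroma"
	"github.com/alecthomas/chroma/formatters"
	"github.com/alecthomas/chroma/lexers"
	"github.com/alecthomas/chroma/styles"
)

func main() {
	contents, err := ioutil.ReadAll(os.Stdin)
	if err != nil {
		log.Fatal(err)
	}
	// Coalesce merges runs of adjacent tokens of the same type.
	lexer := chroma.Coalesce(lexers.Fallback)
	// Tokenise now returns an Iterator rather than taking a callback.
	iterator, err := lexer.Tokenise(nil, string(contents))
	if err != nil {
		log.Fatal(err)
	}
	formatter := formatters.Get("terminal16m")
	if formatter == nil {
		log.Fatal("formatter not found")
	}
	// The formatter pulls tokens from the iterator and writes the output.
	if err := formatter.Format(os.Stdout, styles.Fallback, iterator); err != nil {
		log.Fatal(err)
	}
}
```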
### The HTML formatter
@@ -139,6 +141,9 @@ following constructor options:
- `Standalone()` - generate standalone HTML with embedded CSS.
- `WithClasses()` - use classes rather than inlined style attributes.
- `ClassPrefix(prefix)` - prefix each generated CSS class.
- `TabWidth(width)` - Set the rendered tab width, in characters.
- `WithLineNumbers()` - Render line numbers (style with `LineNumbers`).
- `HighlightLines(ranges)` - Highlight lines in these ranges (style with `LineHighlight`).
If `WithClasses()` is used, the corresponding CSS can be obtained from the formatter with:
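The CSS snippet itself is not shown here; as a hedged sketch, combining the constructor options above with that step (`WriteCSS` and the `[][2]int` ranges type are assumptions from chroma's API, not lines from this diff):
```go
package main

import (
	"log"
	"os"

	"github.com/alecthomas/chroma/formatters/html"
	"github.com/alecthomas/chroma/styles"
)

func main() {
	formatter := html.New(
		html.WithClasses(),     // emit classes instead of inline style attributes
		html.WithLineNumbers(), // styled via the LineNumbers token type
		html.TabWidth(4),
		html.HighlightLines([][2]int{{3, 5}}), // highlight lines 3-5
	)
	// Assumption: WriteCSS emits the stylesheet for the generated classes.
	if err := formatter.WriteCSS(os.Stdout, styles.Fallback); err != nil {
		log.Fatal(err)
	}
}
```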


@@ -146,16 +146,15 @@ command, for Go.
}
formatters.Register("html", html.New(options...))
}
writer := getWriter(w, style)
if len(*filesArgs) == 0 {
contents, err := ioutil.ReadAll(os.Stdin)
kingpin.FatalIfError(err, "")
lex("", string(contents), writer)
format(os.Stdout, style, lex("", string(contents)))
} else {
for _, filename := range *filesArgs {
contents, err := ioutil.ReadFile(filename)
kingpin.FatalIfError(err, "")
lex(filename, string(contents), writer)
format(os.Stdout, style, lex(filename, string(contents)))
}
}
}
@@ -192,14 +191,15 @@ func listAll() {
fmt.Println()
}
func lex(path string, contents string, writer func(*chroma.Token)) {
func lex(path string, contents string) chroma.Iterator {
lexer := selexer(path, contents)
if lexer == nil {
lexer = lexers.Fallback
}
lexer = chroma.Coalesce(lexer)
err := lexer.Tokenise(nil, string(contents), writer)
it, err := lexer.Tokenise(nil, string(contents))
kingpin.FatalIfError(err, "")
return it
}
func selexer(path, contents string) (lexer chroma.Lexer) {
@@ -215,10 +215,8 @@ func selexer(path, contents string) (lexer chroma.Lexer) {
return lexers.Analyse(contents)
}
func getWriter(w io.Writer, style *chroma.Style) func(*chroma.Token) {
func format(w io.Writer, style *chroma.Style, it chroma.Iterator) {
formatter := formatters.Get(*formatterFlag)
// formatter := formatters.TTY8
writer, err := formatter.Format(w, style)
err := formatter.Format(w, style, it)
kingpin.FatalIfError(err, "")
return writer
}


@@ -9,21 +9,24 @@ type coalescer struct {
Lexer
}
func (d *coalescer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
var prev *Token
return d.Lexer.Tokenise(options, text, func(token *Token) {
if prev == nil {
prev = token
} else {
if prev.Type == token.Type && len(prev.Value) < 8192 {
prev.Value += token.Value
} else {
out(prev)
it, err := d.Lexer.Tokenise(options, text)
if err != nil {
return nil, err
}
return func() *Token {
for token := it(); token != nil; token = it() {
if prev == nil {
prev = token
} else {
if prev.Type == token.Type && len(prev.Value) < 8192 {
prev.Value += token.Value
}
}
}
if token.Type == EOF {
out(token)
}
})
out := prev
prev = nil
return out
}, nil
}
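Disentangled from the diff above, the new coalescing logic reads as follows — a sketch of the same algorithm over an `Iterator`, not the committed code verbatim:
```go
package coalesceexample

import "github.com/alecthomas/chroma"

// coalesce merges adjacent tokens of the same type, capping each merged
// value at 8192 bytes, mirroring the coalescer's new iterator body.
func coalesce(it chroma.Iterator) chroma.Iterator {
	var prev *chroma.Token
	return func() *chroma.Token {
		for token := it(); token != nil; token = it() {
			if prev == nil {
				prev = token
				continue
			}
			if prev.Type == token.Type && len(prev.Value) < 8192 {
				prev.Value += token.Value
				continue
			}
			// Type changed (or the pending value is large): emit the
			// pending token and start accumulating the new one.
			out := prev
			prev = token
			return out
		}
		// Input exhausted: flush the final pending token, then nil.
		out := prev
		prev = nil
		return out
	}
}
```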


@@ -14,9 +14,6 @@ func TestCoalesce(t *testing.T) {
}))
actual, err := Tokenise(lexer, nil, "!@#$")
require.NoError(t, err)
expected := []*Token{
&Token{Punctuation, "!@#$"},
&Token{EOF, ""},
}
expected := []*Token{{Punctuation, "!@#$"}}
require.Equal(t, expected, actual)
}


@@ -7,12 +7,10 @@ import (
// A Formatter for Chroma lexers.
type Formatter interface {
// Format returns a formatting function for tokens.
Format(w io.Writer, style *Style) (func(*Token), error)
Format(w io.Writer, style *Style, iterator Iterator) error
}
// A FormatterFunc is a Formatter implemented as a function.
type FormatterFunc func(io.Writer, *Style) (func(*Token), error)
type FormatterFunc func(w io.Writer, style *Style, iterator Iterator) error
func (f FormatterFunc) Format(w io.Writer, s *Style) (func(*Token), error) {
return f(w, s)
}
func (f FormatterFunc) Format(w io.Writer, s *Style, it Iterator) error { return f(w, s, it) }


@@ -10,8 +10,13 @@ import (
var (
// NoOp formatter.
NoOp = Register("noop", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style) (func(*chroma.Token), error) {
return func(t *chroma.Token) { io.WriteString(w, t.Value) }, nil
NoOp = Register("noop", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, iterator chroma.Iterator) error {
for t := iterator(); t != nil; t = iterator() {
if _, err := io.WriteString(w, t.Value); err != nil {
return err
}
}
return nil
}))
// Default HTML formatter outputs self-contained HTML.
htmlFull = Register("html", html.New(html.Standalone(), html.WithClasses()))


@@ -67,15 +67,8 @@ func (h highlightRanges) Len() int { return len(h) }
func (h highlightRanges) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h highlightRanges) Less(i, j int) bool { return h[i][0] < h[j][0] }
func (f *Formatter) Format(w io.Writer, style *chroma.Style) (func(*chroma.Token), error) {
tokens := []*chroma.Token{}
return func(token *chroma.Token) {
tokens = append(tokens, token)
if token.Type == chroma.EOF {
f.writeHTML(w, style, tokens)
return
}
}, nil
func (f *Formatter) Format(w io.Writer, style *chroma.Style, iterator chroma.Iterator) error {
return f.writeHTML(w, style, chroma.Flatten(iterator))
}
func (f *Formatter) writeHTML(w io.Writer, style *chroma.Style, tokens []*chroma.Token) error { // nolint: gocyclo


@@ -20,11 +20,11 @@ func TestCompressStyle(t *testing.T) {
func BenchmarkHTMLFormatter(b *testing.B) {
formatter := New()
writer, err := formatter.Format(ioutil.Discard, styles.Fallback)
assert.NoError(b, err)
b.ResetTimer()
for i := 0; i < b.N; i++ {
err = lexers.Go.Tokenise(nil, "package main\nfunc main()\n{\nprintln(`hello world`)\n}\n", writer)
it, err := lexers.Go.Tokenise(nil, "package main\nfunc main()\n{\nprintln(`hello world`)\n}\n")
assert.NoError(b, err)
err = formatter.Format(ioutil.Discard, styles.Fallback, it)
assert.NoError(b, err)
}
}
@@ -33,7 +33,6 @@ func TestSplitTokensIntoLines(t *testing.T) {
in := []*chroma.Token{
{Value: "hello", Type: chroma.NameKeyword},
{Value: " world\nwhat?\n", Type: chroma.NameKeyword},
{Type: chroma.EOF},
}
expected := [][]*chroma.Token{
[]*chroma.Token{
@@ -45,7 +44,6 @@ func TestSplitTokensIntoLines(t *testing.T) {
},
[]*chroma.Token{
{Type: chroma.NameKeyword},
{Type: chroma.EOF},
},
}
actual := splitTokensIntoLines(in)


@@ -8,8 +8,11 @@ import (
)
// Tokens formatter outputs the raw token structures.
var Tokens = Register("tokens", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style) (func(*chroma.Token), error) {
return func(token *chroma.Token) {
fmt.Fprintln(w, token.GoString())
}, nil
var Tokens = Register("tokens", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it chroma.Iterator) error {
for t := it(); t != nil; t = it() {
if _, err := fmt.Fprintln(w, t.GoString()); err != nil {
return err
}
}
return nil
}))


@@ -234,9 +234,9 @@ type indexedTTYFormatter struct {
table *ttyTable
}
func (c *indexedTTYFormatter) Format(w io.Writer, style *chroma.Style) (func(*chroma.Token), error) {
func (c *indexedTTYFormatter) Format(w io.Writer, style *chroma.Style, it chroma.Iterator) error {
theme := styleToEscapeSequence(c.table, style)
return func(token *chroma.Token) {
for token := it(); token != nil; token = it() {
// TODO: Cache token lookups?
clr, ok := theme[token.Type]
if !ok {
@@ -255,7 +255,8 @@ func (c *indexedTTYFormatter) Format(w io.Writer, style *chroma.Style) (func(*ch
if clr != "" {
fmt.Fprintf(w, "\033[0m")
}
}, nil
}
return nil
}
// TTY8 is an 8-colour terminal formatter.


@@ -10,8 +10,8 @@ import (
// TTY16m is a true-colour terminal formatter.
var TTY16m = Register("terminal16m", chroma.FormatterFunc(trueColourFormatter))
func trueColourFormatter(w io.Writer, style *chroma.Style) (func(*chroma.Token), error) {
return func(token *chroma.Token) {
func trueColourFormatter(w io.Writer, style *chroma.Style, it chroma.Iterator) error {
for token := it(); token != nil; token = it() {
entry := style.Get(token.Type)
if !entry.IsZero() {
out := ""
@@ -33,5 +33,6 @@ func trueColourFormatter(w io.Writer, style *chroma.Style) (func(*chroma.Token),
if !entry.IsZero() {
fmt.Fprint(w, "\033[0m")
}
}, nil
}
return nil
}

iterator.go (new file)

@@ -0,0 +1,41 @@
package chroma
// An Iterator across tokens.
//
// nil will be returned at the end of the Token stream.
type Iterator func() *Token
// Concaterator concatenates tokens from a series of iterators.
func Concaterator(iterators ...Iterator) Iterator {
return func() *Token {
for len(iterators) > 0 {
t := iterators[0]()
if t != nil {
return t
}
iterators = iterators[1:]
}
return nil
}
}
// Literator converts a sequence of literal Tokens into an Iterator.
func Literator(tokens ...*Token) Iterator {
return func() (out *Token) {
if len(tokens) == 0 {
return nil
}
token := tokens[0]
tokens = tokens[1:]
return token
}
}
// Flatten an Iterator into its tokens.
func Flatten(iterator Iterator) []*Token {
out := []*Token{}
for t := iterator(); t != nil; t = iterator() {
out = append(out, t)
}
return out
}
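A quick hedged example of how these three helpers compose (token types and values are arbitrary):
```go
package main

import (
	"fmt"

	"github.com/alecthomas/chroma"
)

func main() {
	a := chroma.Literator(&chroma.Token{Type: chroma.Keyword, Value: "func"})
	b := chroma.Literator(
		&chroma.Token{Type: chroma.Whitespace, Value: " "},
		&chroma.Token{Type: chroma.NameFunction, Value: "main"},
	)
	// Concaterator drains a, then b; Flatten collects the whole stream.
	for _, t := range chroma.Flatten(chroma.Concaterator(a, b)) {
		fmt.Printf("%s %q\n", t.Type, t.Value)
	}
}
```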


@@ -76,10 +76,8 @@ type TokeniseOptions struct {
type Lexer interface {
// Config describing the features of the Lexer.
Config() *Config
// Tokenise text and call out for each generated token.
//
// A token of type EOF will be passed to out() to signify the end of the stream.
Tokenise(options *TokeniseOptions, text string, out func(*Token)) error
// Tokenise returns an Iterator over tokens in text.
Tokenise(options *TokeniseOptions, text string) (Iterator, error)
}
type Lexers []Lexer


@@ -47,7 +47,6 @@ func TestSimpleLexer(t *testing.T) {
{Whitespace, " "},
{LiteralString, "10"},
{Whitespace, "\n"},
{EOF, ""},
}
require.Equal(t, expected, actual)
}


@@ -12,10 +12,10 @@ import (
)
func TestCompileAllRegexes(t *testing.T) {
writer, err := formatters.NoOp.Format(ioutil.Discard, styles.SwapOff)
assert.NoError(t, err)
for _, lexer := range lexers.Registry.Lexers {
err = lexer.Tokenise(nil, "", writer)
it, err := lexer.Tokenise(nil, "")
assert.NoError(t, err, "%s failed", lexer.Config().Name)
err = formatters.NoOp.Format(ioutil.Discard, styles.SwapOff, it)
assert.NoError(t, err, "%s failed", lexer.Config().Name)
}
}


@@ -3,7 +3,7 @@ package lexers
import (
"testing"
"github.com/alecthomas/chroma"
"github.com/stretchr/testify/assert"
)
const lexerBenchSource = `package chroma
@@ -29,6 +29,9 @@ func (f FormatterFunc) Format(w io.Writer, s *Style) (func(*Token), error) {
func Benchmark(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
Go.Tokenise(nil, lexerBenchSource, func(t *chroma.Token) {})
it, err := Go.Tokenise(nil, lexerBenchSource)
assert.NoError(b, err)
for t := it(); t != nil; t = it() {
}
}
}


@@ -38,16 +38,25 @@ var Markdown = Register(MustNewLexer(
},
))
func handleCodeblock(groups []string, lexer Lexer, out func(*Token)) {
out(&Token{String, groups[1]})
out(&Token{String, groups[2]})
out(&Token{Text, groups[3]})
func handleCodeblock(groups []string, lexer Lexer) Iterator {
iterators := []Iterator{}
tokens := []*Token{
&Token{String, groups[1]},
&Token{String, groups[2]},
&Token{Text, groups[3]},
}
code := groups[4]
lexer = Get(groups[2])
if lexer == nil {
out(&Token{String, code})
tokens = append(tokens, &Token{String, code})
iterators = append(iterators, Literator(tokens...))
} else {
lexer.Tokenise(nil, code, out)
sub, err := lexer.Tokenise(nil, code)
if err != nil {
panic(err)
}
iterators = append(iterators, sub)
}
out(&Token{String, groups[5]})
iterators = append(iterators, Literator(&Token{String, groups[5]}))
return Concaterator(iterators...)
}


@@ -35,10 +35,9 @@ func Highlight(w io.Writer, source, lexer, formatter, style string) error {
s = styles.Fallback
}
writer, err := f.Format(w, s)
it, err := l.Tokenise(nil, source)
if err != nil {
return err
}
return l.Tokenise(nil, source, writer)
return f.Format(w, s, it)
}

regexp.go

@@ -19,42 +19,47 @@ type Rule struct {
// An Emitter takes group matches and returns tokens.
type Emitter interface {
// Emit tokens for the given regex groups.
Emit(groups []string, lexer Lexer, out func(*Token))
Emit(groups []string, lexer Lexer) Iterator
}
// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))
type EmitterFunc func(groups []string, lexer Lexer) Iterator
// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }
func (e EmitterFunc) Emit(groups []string, lexer Lexer) Iterator { return e(groups, lexer) }
// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(emitters ...Emitter) Emitter {
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
// NOTE: If this line panics, there is a mismatch with groups. Uncomment the following line to debug.
return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
iterators := make([]Iterator, 0, len(groups)-1)
// NOTE: If this panics, there is a mismatch with groups. Uncomment the following line to debug.
// fmt.Printf("%s %#v\n", emitters, groups[1:])
for i, group := range groups[1:] {
emitters[i].Emit([]string{group}, lexer, out)
iterators = append(iterators, emitters[i].Emit([]string{group}, lexer))
}
return
return Concaterator(iterators...)
})
}
// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer, options *TokeniseOptions) Emitter {
return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
if err := lexer.Tokenise(options, groups[0], out); err != nil {
return EmitterFunc(func(groups []string, _ Lexer) Iterator {
it, err := lexer.Tokenise(options, groups[0])
if err != nil {
panic(err)
}
return it
})
}
// UsingSelf is like Using, but uses the current Lexer.
func UsingSelf(state string) Emitter {
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
it, err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0])
if err != nil {
panic(err)
}
return it
})
}
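To place these emitters in context, a hypothetical lexer fragment — illustration only, not from this commit; written as lexer files in the `lexers` package are, with chroma dot-imported:
```go
package lexers

import (
	. "github.com/alecthomas/chroma"
)

// Hypothetical example: ByGroups pairs each regex capture group with an
// Emitter; a bare TokenType is itself an Emitter (it returns a
// single-token Literator, see tokentype.go); UsingSelf re-enters this
// lexer in the named state.
var Example = Register(MustNewLexer(
	&Config{Name: "Example", Aliases: []string{"example"}},
	Rules{
		"root": {
			{`(func)(\s+)(\w+)`, ByGroups(KeywordDeclaration, Whitespace, NameFunction), nil},
			{`\{`, Punctuation, Push("body")},
			{`\s+`, Whitespace, nil},
		},
		"body": {
			{`\}`, Punctuation, Pop(1)},
			{`[^{}]+`, UsingSelf("root"), nil},
		},
	},
))
```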
@@ -69,7 +74,14 @@ func Words(prefix, suffix string, words ...string) string {
// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
out := []*Token{}
return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
it, err := lexer.Tokenise(options, text)
if err != nil {
return nil, err
}
for t := it(); t != nil; t = it() {
out = append(out, t)
}
return out, nil
}
// Rules maps from state to a sequence of Rules.
@@ -129,6 +141,7 @@ type CompiledRule struct {
type CompiledRules map[string][]CompiledRule
type LexerState struct {
Lexer *RegexLexer
Text []rune
Pos int
Rules map[string][]CompiledRule
@@ -149,6 +162,55 @@ func (l *LexerState) Get(key interface{}) interface{} {
return l.MutatorContext[key]
}
func (l *LexerState) Iterator() Iterator {
iteratorStack := []Iterator{}
return func() *Token {
for l.Pos < len(l.Text) && len(l.Stack) > 0 {
// Exhaust the IteratorStack, if any.
for len(iteratorStack) > 0 {
n := len(iteratorStack) - 1
t := iteratorStack[n]()
if t == nil {
iteratorStack = iteratorStack[:n]
continue
}
return t
}
l.State = l.Stack[len(l.Stack)-1]
ruleIndex, rule, groups := matchRules(l.Text[l.Pos:], l.Rules[l.State])
// No match.
if groups == nil {
l.Pos++
return &Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
}
l.Rule = ruleIndex
l.Groups = groups
l.Pos += utf8.RuneCountInString(groups[0])
if rule.Mutator != nil {
if err := rule.Mutator.Mutate(l); err != nil {
panic(err)
}
}
if rule.Type != nil {
iteratorStack = append(iteratorStack, rule.Type.Emit(l.Groups, l.Lexer))
}
}
// Exhaust the IteratorStack, if any.
// Duplicate code, but eh.
for len(iteratorStack) > 0 {
n := len(iteratorStack) - 1
t := iteratorStack[n]()
if t == nil {
iteratorStack = iteratorStack[:n]
continue
}
return t
}
return nil
}
}
type RegexLexer struct {
config *Config
analyser func(text string) float32
@@ -197,9 +259,9 @@ func (r *RegexLexer) maybeCompile() (err error) {
return nil
}
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
if err := r.maybeCompile(); err != nil {
return err
return nil, err
}
if options == nil {
options = defaultOptions
@@ -210,30 +272,7 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*T
Rules: r.rules,
MutatorContext: map[interface{}]interface{}{},
}
for state.Pos < len(state.Text) && len(state.Stack) > 0 {
state.State = state.Stack[len(state.Stack)-1]
ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
// No match.
if groups == nil {
out(&Token{Error, string(state.Text[state.Pos : state.Pos+1])})
state.Pos++
continue
}
state.Rule = ruleIndex
state.Groups = groups
state.Pos += utf8.RuneCountInString(groups[0])
if rule.Mutator != nil {
if err := rule.Mutator.Mutate(state); err != nil {
return err
}
}
if rule.Type != nil {
rule.Type.Emit(state.Groups, r, out)
}
}
out(&Token{Type: EOF})
return nil
return state.Iterator(), nil
}
func matchRules(text []rune, rules []CompiledRule) (int, CompiledRule, []string) {
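One consequence of returning `state.Iterator()` instead of running the loop eagerly: tokenisation is now lazy, so a consumer can stop early and the rest of the input is never scanned. A small hedged sketch:
```go
package main

import (
	"fmt"
	"log"

	"github.com/alecthomas/chroma/lexers"
)

func main() {
	it, err := lexers.Go.Tokenise(nil, "package main\nfunc main() {}\n")
	if err != nil {
		log.Fatal(err)
	}
	// Pull just the first three tokens; the remainder of the source is
	// never matched against the rules.
	for i := 0; i < 3; i++ {
		t := it()
		if t == nil {
			break
		}
		fmt.Printf("%s %q\n", t.Type, t.Value)
	}
}
```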


@@ -21,16 +21,12 @@ const (
LineNumbers
// Line highlight style.
LineHighlight
// Character highlight style.
Highlight
// Input that could not be tokenised.
Error
// Other is used by the Delegate lexer to indicate which tokens should be handled by the delegate.
Other
// No highlighting.
None
// Final token.
EOF
)
// Keywords.
@@ -208,6 +204,6 @@ func (t TokenType) InSubCategory(other TokenType) bool {
return t/100 == other/100
}
func (t TokenType) Emit(groups []string, lexer Lexer, out func(*Token)) {
out(&Token{Type: t, Value: groups[0]})
func (t TokenType) Emit(groups []string, lexer Lexer) Iterator {
return Literator(&Token{Type: t, Value: groups[0]})
}
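Because `TokenType` now satisfies `Emitter` by returning a one-token `Literator`, a bare token type still works as a rule's emitter. A tiny hedged example of invoking it directly:
```go
package main

import (
	"fmt"

	"github.com/alecthomas/chroma"
)

func main() {
	// The lexer argument is unused by TokenType.Emit, so nil is fine here.
	it := chroma.Keyword.Emit([]string{"func"}, nil)
	fmt.Println(it()) // the single Keyword token for "func"
	fmt.Println(it()) // <nil>: the stream is exhausted
}
```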