
Tokens by value (#187)

This results in about an 8% improvement in speed.
Daniel Eloff
2018-11-03 16:22:51 -07:00
committed by Alec Thomas
parent 5a473179cf
commit 9c3abeae1d
26 changed files with 2536 additions and 98 deletions
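
In short, Iterator changes from func() *Token to func() Token, and the nil end-of-stream marker becomes the zero-value sentinel chroma.EOF, which avoids allocating a Token on the heap for every match. A minimal consumer sketch of the new value-based API (lexers.Get is shown only as a convenient entry point for illustration):

package main

import (
	"fmt"
	"log"

	"github.com/alecthomas/chroma"
	"github.com/alecthomas/chroma/lexers"
)

func main() {
	// Before this commit: for t := it(); t != nil; t = it() { ... }
	// After it: tokens are plain values and the stream ends with chroma.EOF.
	it, err := lexers.Get("go").Tokenise(nil, "package main\n")
	if err != nil {
		log.Fatal(err)
	}
	for t := it(); t != chroma.EOF; t = it() {
		fmt.Printf("%s %q\n", t.Type, t.Value)
	}
}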


@@ -259,7 +259,7 @@ func format(w io.Writer, style *chroma.Style, it chroma.Iterator) {
func check(filename string, it chroma.Iterator) {
line, col := 1, 0
for token := it(); token != nil; token = it() {
for token := it(); token != chroma.EOF; token = it() {
if token.Type == chroma.Error {
fmt.Printf("%s:%d:%d %q\n", filename, line, col, token.String())
}


@@ -6,17 +6,17 @@ func Coalesce(lexer Lexer) Lexer { return &coalescer{lexer} }
type coalescer struct{ Lexer }
func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
var prev *Token
var prev Token
it, err := d.Lexer.Tokenise(options, text)
if err != nil {
return nil, err
}
return func() *Token {
for token := it(); token != nil; token = it() {
return func() Token {
for token := it(); token != (EOF); token = it() {
if len(token.Value) == 0 {
continue
}
if prev == nil {
if prev == EOF {
prev = token
} else {
if prev.Type == token.Type && len(prev.Value) < 8192 {
@@ -29,7 +29,7 @@ func (d *coalescer) Tokenise(options *TokeniseOptions, text string) (Iterator, e
}
}
out := prev
prev = nil
prev = EOF
return out
}, nil
}


@@ -14,6 +14,6 @@ func TestCoalesce(t *testing.T) {
}))
actual, err := Tokenise(lexer, nil, "!@#$")
assert.NoError(t, err)
expected := []*Token{{Punctuation, "!@#$"}}
expected := []Token{{Punctuation, "!@#$"}}
assert.Equal(t, expected, actual)
}


@@ -31,7 +31,7 @@ func (d *delegatingLexer) Config() *Config {
// An insertion is the character range where language tokens should be inserted.
type insertion struct {
start, end int
tokens []*Token
tokens []Token
}
func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
@@ -44,15 +44,15 @@ func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Itera
insertions := []*insertion{}
var insert *insertion
offset := 0
var last *Token
var last Token
for _, t := range tokens {
if t.Type == Other {
if last != nil && insert != nil && last.Type != Other {
if last != EOF && insert != nil && last.Type != Other {
insert.end = offset
}
others.WriteString(t.Value)
} else {
if last == nil || last.Type == Other {
if last == EOF || last.Type == Other {
insert = &insertion{start: offset}
insertions = append(insertions, insert)
}
@@ -73,12 +73,12 @@ func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Itera
}
// Interleave the two sets of tokens.
out := []*Token{}
var out []Token
offset = 0 // Offset into text.
tokenIndex := 0
nextToken := func() *Token {
nextToken := func() Token {
if tokenIndex >= len(rootTokens) {
return nil
return EOF
}
t := rootTokens[tokenIndex]
tokenIndex++
@@ -95,18 +95,18 @@ func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Itera
}
t := nextToken()
i := nextInsertion()
for t != nil || i != nil {
for t != EOF || i != nil {
// fmt.Printf("%d->%d:%q %d->%d:%q\n", offset, offset+len(t.Value), t.Value, i.start, i.end, Stringify(i.tokens...))
if t == nil || (i != nil && i.start < offset+len(t.Value)) {
var l *Token
if t == EOF || (i != nil && i.start < offset+len(t.Value)) {
var l Token
l, t = splitToken(t, i.start-offset)
if l != nil {
if l != EOF {
out = append(out, l)
offset += len(l.Value)
}
out = append(out, i.tokens...)
offset += i.end - i.start
if t == nil {
if t == EOF {
t = nextToken()
}
i = nextInsertion()
@@ -119,15 +119,15 @@ func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Itera
return Literator(out...), nil
}
func splitToken(t *Token, offset int) (l *Token, r *Token) {
if t == nil {
return nil, nil
func splitToken(t Token, offset int) (l Token, r Token) {
if t == EOF {
return EOF, EOF
}
if offset == 0 {
return nil, t
return EOF, t
}
if offset == len(t.Value) {
return t, nil
return t, EOF
}
l = t.Clone()
r = t.Clone()
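
splitToken now passes and returns Token values, with EOF standing in for the absent halves. A self-contained sketch of the intended behaviour; splitAt here is a stand-in for the unexported splitToken, and the slicing at the end is assumed from the truncated hunk above:

package main

import (
	"fmt"

	"github.com/alecthomas/chroma"
)

// splitAt mirrors the delegating lexer's splitToken helper: it cuts a token's
// value at offset, using chroma.EOF for an empty half.
func splitAt(t chroma.Token, offset int) (l, r chroma.Token) {
	if t == chroma.EOF {
		return chroma.EOF, chroma.EOF
	}
	if offset == 0 {
		return chroma.EOF, t
	}
	if offset == len(t.Value) {
		return t, chroma.EOF
	}
	l, r = t, t
	l.Value, r.Value = t.Value[:offset], t.Value[offset:]
	return l, r
}

func main() {
	l, r := splitAt(chroma.Token{Type: chroma.Name, Value: "hello"}, 2)
	fmt.Printf("%q %q\n", l.Value, r.Value) // "he" "llo"
}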


@@ -1,6 +1,7 @@
package chroma
import (
"fmt"
"testing"
"github.com/alecthomas/assert"
@@ -31,9 +32,9 @@ func TestDelegate(t *testing.T) {
testdata := []struct {
name string
source string
expected []*Token
expected []Token
}{
{"SourceInMiddle", `hello world <? what ?> there`, []*Token{
{"SourceInMiddle", `hello world <? what ?> there`, []Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
@@ -48,7 +49,7 @@ func TestDelegate(t *testing.T) {
{TextWhitespace, " "},
{Name, "there"},
}},
{"SourceBeginning", `<? what ?> hello world there`, []*Token{
{"SourceBeginning", `<? what ?> hello world there`, []Token{
{CommentPreproc, "<?"},
{TextWhitespace, " "},
{Keyword, "what"},
@@ -61,7 +62,7 @@ func TestDelegate(t *testing.T) {
{TextWhitespace, " "},
{Name, "there"},
}},
{"SourceEnd", `hello world <? what there`, []*Token{
{"SourceEnd", `hello world <? what there`, []Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
@@ -73,7 +74,7 @@ func TestDelegate(t *testing.T) {
{TextWhitespace, " "},
{Error, "there"},
}},
{"SourceMultiple", "hello world <? what ?> hello there <? what ?> hello", []*Token{
{"SourceMultiple", "hello world <? what ?> hello there <? what ?> hello", []Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
@@ -104,6 +105,7 @@ func TestDelegate(t *testing.T) {
it, err := delegate.Tokenise(nil, test.source)
assert.NoError(t, err)
actual := it.Tokens()
fmt.Println(actual)
assert.Equal(t, test.expected, actual)
})
}


@@ -11,7 +11,7 @@ import (
var (
// NoOp formatter.
NoOp = Register("noop", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, iterator chroma.Iterator) error {
for t := iterator(); t != nil; t = iterator() {
for t := iterator(); t != chroma.EOF; t = iterator() {
if _, err := io.WriteString(w, t.Value); err != nil {
return err
}


@@ -129,7 +129,7 @@ func (f *Formatter) restyle(style *chroma.Style) (*chroma.Style, error) {
// We deliberately don't use html/template here because it is two orders of magnitude slower (benchmarked).
//
// OTOH we need to be super careful about correct escaping...
func (f *Formatter) writeHTML(w io.Writer, style *chroma.Style, tokens []*chroma.Token) (err error) { // nolint: gocyclo
func (f *Formatter) writeHTML(w io.Writer, style *chroma.Style, tokens []chroma.Token) (err error) { // nolint: gocyclo
style, err = f.restyle(style)
if err != nil {
return err
@@ -391,8 +391,8 @@ func compressStyle(s string) string {
return strings.Join(out, ";")
}
func splitTokensIntoLines(tokens []*chroma.Token) (out [][]*chroma.Token) {
line := []*chroma.Token{}
func splitTokensIntoLines(tokens []chroma.Token) (out [][]chroma.Token) {
var line []chroma.Token
for _, token := range tokens {
for strings.Contains(token.Value, "\n") {
parts := strings.SplitAfterN(token.Value, "\n", 2)


@@ -32,11 +32,11 @@ func BenchmarkHTMLFormatter(b *testing.B) {
}
func TestSplitTokensIntoLines(t *testing.T) {
in := []*chroma.Token{
in := []chroma.Token{
{Value: "hello", Type: chroma.NameKeyword},
{Value: " world\nwhat?\n", Type: chroma.NameKeyword},
}
expected := [][]*chroma.Token{
expected := [][]chroma.Token{
{
{Type: chroma.NameKeyword, Value: "hello"},
{Type: chroma.NameKeyword, Value: " world\n"},
@@ -53,7 +53,7 @@ func TestSplitTokensIntoLines(t *testing.T) {
}
func TestIteratorPanicRecovery(t *testing.T) {
it := func() *chroma.Token {
it := func() chroma.Token {
panic(errors.New("bad"))
}
err := New().Format(ioutil.Discard, styles.Fallback, it)


@@ -12,7 +12,7 @@ import (
var JSON = Register("json", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it chroma.Iterator) error {
fmt.Fprintln(w, "[")
i := 0
for t := it(); t != nil; t = it() {
for t := it(); t != chroma.EOF; t = it() {
if i > 0 {
fmt.Fprintln(w, ",")
}


@@ -9,7 +9,7 @@ import (
// Tokens formatter outputs the raw token structures.
var Tokens = Register("tokens", chroma.FormatterFunc(func(w io.Writer, s *chroma.Style, it chroma.Iterator) error {
for t := it(); t != nil; t = it() {
for t := it(); t != chroma.EOF; t = it() {
if _, err := fmt.Fprintln(w, t.GoString()); err != nil {
return err
}


@@ -216,7 +216,7 @@ func (c *indexedTTYFormatter) Format(w io.Writer, style *chroma.Style, it chroma
}
}()
theme := styleToEscapeSequence(c.table, style)
for token := it(); token != nil; token = it() {
for token := it(); token != chroma.EOF; token = it() {
// TODO: Cache token lookups?
clr, ok := theme[token.Type]
if !ok {


@@ -11,7 +11,7 @@ import (
var TTY16m = Register("terminal16m", chroma.FormatterFunc(trueColourFormatter))
func trueColourFormatter(w io.Writer, style *chroma.Style, it chroma.Iterator) error {
for token := it(); token != nil; token = it() {
for token := it(); token != chroma.EOF; token = it() {
entry := style.Get(token.Type)
if !entry.IsZero() {
out := ""


@@ -5,12 +5,12 @@ package chroma
// nil will be returned at the end of the Token stream.
//
// If an error occurs within an Iterator, it may propagate this in a panic. Formatters should recover.
type Iterator func() *Token
type Iterator func() Token
// Tokens consumes all tokens from the iterator and returns them as a slice.
func (i Iterator) Tokens() []*Token {
out := []*Token{}
for t := i(); t != nil; t = i() {
func (i Iterator) Tokens() []Token {
var out []Token
for t := i(); t != EOF; t = i() {
out = append(out, t)
}
return out
@@ -18,23 +18,23 @@ func (i Iterator) Tokens() []*Token {
// Concaterator concatenates tokens from a series of iterators.
func Concaterator(iterators ...Iterator) Iterator {
return func() *Token {
return func() Token {
for len(iterators) > 0 {
t := iterators[0]()
if t != nil {
if t != EOF {
return t
}
iterators = iterators[1:]
}
return nil
return EOF
}
}
// Literator converts a sequence of literal Tokens into an Iterator.
func Literator(tokens ...*Token) Iterator {
return func() (out *Token) {
func Literator(tokens ...Token) Iterator {
return func() Token {
if len(tokens) == 0 {
return nil
return EOF
}
token := tokens[0]
tokens = tokens[1:]
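
Literator, Concaterator and Iterator.Tokens likewise move to value tokens. A short usage sketch of the value-based iterator API:

package main

import (
	"fmt"

	"github.com/alecthomas/chroma"
)

func main() {
	// Literator wraps literal Token values in an Iterator; Tokens drains it
	// until the chroma.EOF sentinel comes back.
	it := chroma.Literator(
		chroma.Token{Type: chroma.Keyword, Value: "package"},
		chroma.Token{Type: chroma.TextWhitespace, Value: " "},
		chroma.Token{Type: chroma.Name, Value: "main"},
	)
	tokens := it.Tokens()
	fmt.Println(chroma.Stringify(tokens...)) // package main
}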


@@ -66,12 +66,12 @@ type Token struct {
func (t *Token) String() string { return t.Value }
func (t *Token) GoString() string { return fmt.Sprintf("&Token{%s, %q}", t.Type, t.Value) }
func (t *Token) Clone() *Token {
clone := &Token{}
*clone = *t
return clone
func (t *Token) Clone() Token {
return *t
}
var EOF Token
type TokeniseOptions struct {
// State to start tokenisation in. Defaults to "root".
State string
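
Clone now returns a value and EOF is declared as the zero-value Token; together with EOFType = 0 further down, this is what makes the t != EOF comparisons valid, since Token is a small comparable struct. A quick illustrative check:

package main

import (
	"fmt"

	"github.com/alecthomas/chroma"
)

func main() {
	// EOF is the zero value of Token (EOFType is deliberately 0), so plain
	// struct equality replaces the old nil-pointer check.
	fmt.Println(chroma.EOF == chroma.Token{})      // true
	fmt.Println(chroma.EOF.Type == chroma.EOFType) // true

	tok := chroma.Token{Type: chroma.Name, Value: "x"}
	fmt.Println(tok == chroma.EOF) // false
}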


@@ -35,7 +35,7 @@ func TestSimpleLexer(t *testing.T) {
a = 10
`)
assert.NoError(t, err)
expected := []*Token{
expected := []Token{
{Whitespace, "\n\t"},
{Comment, "; this is a comment"},
{Whitespace, "\n\t"},


@@ -34,7 +34,7 @@ var HTTP = internal.Register(httpBodyContentTypeLexer(MustNewLexer(
)))
func httpContentBlock(groups []string, lexer Lexer) Iterator {
tokens := []*Token{
tokens := []Token{
{Generic, groups[0]},
}
return Literator(tokens...)
@@ -42,7 +42,7 @@ func httpContentBlock(groups []string, lexer Lexer) Iterator {
}
func httpHeaderBlock(groups []string, lexer Lexer) Iterator {
tokens := []*Token{
tokens := []Token{
{Name, groups[1]},
{Text, groups[2]},
{Operator, groups[3]},
@@ -54,7 +54,7 @@ func httpHeaderBlock(groups []string, lexer Lexer) Iterator {
}
func httpContinuousHeaderBlock(groups []string, lexer Lexer) Iterator {
tokens := []*Token{
tokens := []Token{
{Text, groups[1]},
{Literal, groups[2]},
{Text, groups[3]},
@@ -76,8 +76,8 @@ func (d *httpBodyContentTyper) Tokenise(options *TokeniseOptions, text string) (
return nil, err
}
return func() *Token {
for token := it(); token != nil; token = it() {
return func() Token {
for token := it(); token != EOF; token = it() {
switch {
case token.Type == Name && strings.ToLower(token.Value) == "content-type":
{
@@ -112,7 +112,7 @@ func (d *httpBodyContentTyper) Tokenise(options *TokeniseOptions, text string) (
if err != nil {
panic(err)
}
return nil
return EOF
}
}
@@ -122,11 +122,11 @@ func (d *httpBodyContentTyper) Tokenise(options *TokeniseOptions, text string) (
}
if subIterator != nil {
for token := subIterator(); token != nil; token = subIterator() {
for token := subIterator(); token != EOF; token = subIterator() {
return token
}
}
return nil
return EOF
}, nil
}

File diff suppressed because it is too large.


@@ -65,7 +65,7 @@ func TestLexers(t *testing.T) {
assert.NoError(t, err)
// Read expected JSON into token slice.
expected := []*chroma.Token{}
var expected []chroma.Token
r, err := os.Open(expectedFilename)
assert.NoError(t, err)
err = json.NewDecoder(r).Decode(&expected)


@@ -61,7 +61,7 @@ var Restructuredtext = internal.Register(MustNewLexer(
func rstCodeBlock(groups []string, lexer Lexer) Iterator {
iterators := []Iterator{}
tokens := []*Token{
tokens := []Token{
{Punctuation, groups[1]},
{Text, groups[2]},
{OperatorWord, groups[3]},
@@ -73,7 +73,7 @@ func rstCodeBlock(groups []string, lexer Lexer) Iterator {
code := strings.Join(groups[8:], "")
lexer = internal.Get(groups[6])
if lexer == nil {
tokens = append(tokens, &Token{String, code})
tokens = append(tokens, Token{String, code})
iterators = append(iterators, Literator(tokens...))
} else {
sub, err := lexer.Tokenise(nil, code)


@@ -122,7 +122,7 @@ func Default(mutators ...Mutator) Rule {
}
// Stringify returns the raw string for a set of tokens.
func Stringify(tokens ...*Token) string {
func Stringify(tokens ...Token) string {
out := []string{}
for _, t := range tokens {
out = append(out, t.Value)


@@ -52,6 +52,6 @@ func TestCombine(t *testing.T) {
})
it, err := l.Tokenise(nil, "hello world")
assert.NoError(t, err)
expected := []*Token{{String, `hello`}, {Whitespace, ` `}, {Name, `world`}}
expected := []Token{{String, `hello`}, {Whitespace, ` `}, {Name, `world`}}
assert.Equal(t, expected, it.Tokens())
}


@@ -140,13 +140,13 @@ func Words(prefix, suffix string, words ...string) string {
}
// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
out := []*Token{}
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
var out []Token
it, err := lexer.Tokenise(options, text)
if err != nil {
return nil, err
}
for t := it(); t != nil; t = it() {
for t := it(); t != EOF; t = it() {
out = append(out, t)
}
return out, nil
@@ -246,13 +246,13 @@ func (l *LexerState) Get(key interface{}) interface{} {
return l.MutatorContext[key]
}
func (l *LexerState) Iterator() *Token {
func (l *LexerState) Iterator() Token {
for l.Pos < len(l.Text) && len(l.Stack) > 0 {
// Exhaust the iterator stack, if any.
for len(l.iteratorStack) > 0 {
n := len(l.iteratorStack) - 1
t := l.iteratorStack[n]()
if t == nil {
if t == EOF {
l.iteratorStack = l.iteratorStack[:n]
continue
}
@@ -271,7 +271,7 @@ func (l *LexerState) Iterator() *Token {
// No match.
if groups == nil {
l.Pos++
return &Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
}
l.Rule = ruleIndex
l.Groups = groups
@@ -290,7 +290,7 @@ func (l *LexerState) Iterator() *Token {
for len(l.iteratorStack) > 0 {
n := len(l.iteratorStack) - 1
t := l.iteratorStack[n]()
if t == nil {
if t == EOF {
l.iteratorStack = l.iteratorStack[:n]
continue
}
@@ -301,9 +301,9 @@ func (l *LexerState) Iterator() *Token {
if l.Pos != len(l.Text) && len(l.Stack) == 0 {
value := string(l.Text[l.Pos:])
l.Pos = len(l.Text)
return &Token{Type: Error, Value: value}
return Token{Type: Error, Value: value}
}
return nil
return EOF
}
type RegexLexer struct {


@@ -14,7 +14,7 @@ func TestNewlineAtEndOfFile(t *testing.T) {
}))
it, err := l.Tokenise(nil, `hello`)
assert.NoError(t, err)
assert.Equal(t, []*Token{{Keyword, "hello"}, {Whitespace, "\n"}}, it.Tokens())
assert.Equal(t, []Token{{Keyword, "hello"}, {Whitespace, "\n"}}, it.Tokens())
l = Coalesce(MustNewLexer(nil, Rules{
"root": {
@@ -23,5 +23,5 @@ func TestNewlineAtEndOfFile(t *testing.T) {
}))
it, err = l.Tokenise(nil, `hello`)
assert.NoError(t, err)
assert.Equal(t, []*Token{{Error, "hello"}}, it.Tokens())
assert.Equal(t, []Token{{Error, "hello"}}, it.Tokens())
}


@@ -2,11 +2,11 @@ package chroma
type remappingLexer struct {
lexer Lexer
mapper func(*Token) []*Token
mapper func(Token) []Token
}
// RemappingLexer remaps a token to a set of, potentially empty, tokens.
func RemappingLexer(lexer Lexer, mapper func(*Token) []*Token) Lexer {
func RemappingLexer(lexer Lexer, mapper func(Token) []Token) Lexer {
return &remappingLexer{lexer, mapper}
}
@@ -19,8 +19,8 @@ func (r *remappingLexer) Tokenise(options *TokeniseOptions, text string) (Iterat
if err != nil {
return nil, err
}
buffer := []*Token{}
return func() *Token {
var buffer []Token
return func() Token {
for {
if len(buffer) > 0 {
t := buffer[0]
@@ -28,7 +28,7 @@ func (r *remappingLexer) Tokenise(options *TokeniseOptions, text string) (Iterat
return t
}
t := it()
if t == nil {
if t == EOF {
return t
}
buffer = r.mapper(t)
@@ -67,7 +67,7 @@ func TypeRemappingLexer(lexer Lexer, mapping TypeMapping) Lexer {
}
}
return RemappingLexer(lexer, func(t *Token) []*Token {
return RemappingLexer(lexer, func(t Token) []Token {
if k, ok := lut[t.Type]; ok {
if tt, ok := k[t.Value]; ok {
t.Type = tt
@@ -75,6 +75,6 @@ func TypeRemappingLexer(lexer Lexer, mapping TypeMapping) Lexer {
t.Type = tt
}
}
return []*Token{t}
return []Token{t}
})
}
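
Remapping functions now take and return Token values, and returning an empty slice simply drops the token. A hedged usage sketch; the whitespace-stripping mapper and the ini lexer are illustrative choices, not part of this commit:

package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/alecthomas/chroma"
	"github.com/alecthomas/chroma/lexers"
)

func main() {
	// The mapper receives and returns Token values; returning an empty
	// slice drops the token. Here whitespace-only tokens are filtered out.
	lexer := chroma.RemappingLexer(lexers.Get("ini"), func(t chroma.Token) []chroma.Token {
		if strings.TrimSpace(t.Value) == "" {
			return nil
		}
		return []chroma.Token{t}
	})
	tokens, err := chroma.Tokenise(lexer, nil, "a = 10\n")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(chroma.Stringify(tokens...)) // whitespace-only tokens are gone
}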


@@ -19,7 +19,7 @@ func TestRemappingLexer(t *testing.T) {
it, err := lexer.Tokenise(nil, `if true then print else end`)
assert.NoError(t, err)
expected := []*Token{
expected := []Token{
{Keyword, "if"}, {TextWhitespace, " "}, {Name, "true"}, {TextWhitespace, " "}, {Name, "then"},
{TextWhitespace, " "}, {Name, "print"}, {TextWhitespace, " "}, {Keyword, "else"},
{TextWhitespace, " "}, {Name, "end"},


@@ -54,6 +54,8 @@ const (
Other
// No highlighting.
None
// Used as an EOF marker / nil token
EOFType TokenType = 0
)
// Keywords.
@@ -341,5 +343,5 @@ func (t TokenType) InSubCategory(other TokenType) bool {
}
func (t TokenType) Emit(groups []string, lexer Lexer) Iterator {
return Literator(&Token{Type: t, Value: groups[0]})
return Literator(Token{Type: t, Value: groups[0]})
}