diff --git a/delegate.go b/delegate.go
index 65e5be5..bd2b42c 100644
--- a/delegate.go
+++ b/delegate.go
@@ -9,10 +9,12 @@ type delegatingLexer struct {
 	language Lexer
 }

-// DelegatingLexer takes two lexer as arguments. A root lexer and
-// a language lexer. First everything is scanned using the language
-// lexer, afterwards all Other tokens are lexed using the root
-// lexer.
+// DelegatingLexer combines two lexers to handle the common case of a language embedded inside another, such as PHP
+// inside HTML or PHP inside plain text.
+//
+// It takes two lexers as arguments: a root lexer and a language lexer. First everything is scanned using the language
+// lexer, which must return "Other" for unrecognised tokens. Then all "Other" tokens are lexed using the root lexer.
+// Finally, these two sets of tokens are merged.
 //
 // The lexers from the template lexer package use this base lexer.
 func DelegatingLexer(root Lexer, language Lexer) Lexer {
@@ -26,101 +28,108 @@ func (d *delegatingLexer) Config() *Config {
 	return d.language.Config()
 }

-type tokenSplit struct {
-	pos    int
-	tokens []*Token
-}
-
-func splitOtherTokens(it Iterator) ([]tokenSplit, string) {
-	splits := []tokenSplit{}
-	var split *tokenSplit
-	other := bytes.Buffer{}
-	offset := 0
-	for t := it(); t != nil; t = it() {
-		if t.Type == Other {
-			if split != nil {
-				splits = append(splits, *split)
-				split = nil
-			}
-			other.WriteString(t.Value)
-		} else {
-			if split == nil {
-				split = &tokenSplit{pos: offset}
-			}
-			split.tokens = append(split.tokens, t)
-		}
-		offset += len(t.Value)
-	}
-	if split != nil {
-		splits = append(splits, *split)
-	}
-	return splits, other.String()
+// An insertion is the character range where language tokens should be inserted.
+type insertion struct {
+	start, end int
+	tokens     []*Token
 }

 func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
-	it, err := d.language.Tokenise(options, text)
+	tokens, err := Tokenise(Coalesce(d.language), options, text)
 	if err != nil {
 		return nil, err
 	}
-	splits, other := splitOtherTokens(it)
-	it, err = d.root.Tokenise(options, other)
-	if err != nil {
-		return nil, err
-	}
-
+	// Compute insertions and gather "Other" tokens.
+	others := &bytes.Buffer{}
+	insertions := []*insertion{}
+	var insert *insertion
 	offset := 0
-	return func() *Token {
-		// First, see if there's a split at the start of this token.
-		for len(splits) > 0 && splits[0].pos == offset {
-			if len(splits[0].tokens) > 0 {
-				t := splits[0].tokens[0]
-				splits[0].tokens = splits[0].tokens[1:]
-				offset += len(t.Value)
-				return t
+	var last *Token
+	for _, t := range tokens {
+		if t.Type == Other {
+			if last != nil && insert != nil && last.Type != Other {
+				insert.end = offset
 			}
-			// End of tokens from this split, shift it off the queue.
-			splits = splits[1:]
+			others.WriteString(t.Value)
+		} else {
+			if last == nil || last.Type == Other {
+				insert = &insertion{start: offset}
+				insertions = append(insertions, insert)
+			}
+			insert.tokens = append(insert.tokens, t)
 		}
+		last = t
+		offset += len(t.Value)
+	}

-		// No split, try to consume a token.
-		t := it()
-		if t == nil {
-			for len(splits) > 0 {
-				if len(splits[0].tokens) > 0 {
-					t = splits[0].tokens[0]
-					splits[0].tokens = splits[0].tokens[1:]
-					offset += len(t.Value)
-					return t
-				}
-				// End of tokens from this split, shift it off the queue.
-				splits = splits[1:]
-			}
+	if len(insertions) == 0 {
+		return d.root.Tokenise(options, text)
+	}
+
+	// Lex the other tokens.
+	rootTokens, err := Tokenise(d.root, options, others.String())
+	if err != nil {
+		return nil, err
+	}
+
+	// Interleave the two sets of tokens.
+	out := []*Token{}
+	offset = 0
+	index := 0
+	next := func() *Token {
+		if index >= len(rootTokens) {
 			return nil
 		}
-
-		// Check if there's a split in the middle of the current token.
-		if len(splits) > 0 && splits[0].pos < offset+len(t.Value) {
-			// Split the token.
-			next := t.Clone()
-			point := splits[0].pos - offset
-			next.Value = next.Value[point:]
-			t.Value = t.Value[:point]
-
-			// Insert the tail of the split token after any other splits at the same point.
-			tailPos := offset + len(t.Value)
-			tail := []tokenSplit{{pos: tailPos, tokens: []*Token{next}}}
-			i := 0
-			for ; i < len(splits); i++ {
-				if splits[i].pos > tailPos {
-					break
-				}
-			}
-			splits = append(splits[:i], append(tail, splits[i:]...)...)
-
-			// Finally, return the head.
+		t := rootTokens[index]
+		index++
+		return t
+	}
+	t := next()
+	for _, insert := range insertions {
+		// Consume tokens until insertion point.
+		for t != nil && offset+len(t.Value) <= insert.start {
+			out = append(out, t)
+			offset += len(t.Value)
+			t = next()
+		}
+		// End of root tokens, append insertion point.
+		if t == nil {
+			out = append(out, insert.tokens...)
+			break
 		}
-		offset += len(t.Value)
-		return t
-	}, nil
+		// Split and insert.
+		l, r := splitToken(t, insert.start-offset)
+		if l != nil {
+			out = append(out, l)
+			offset += len(l.Value)
+		}
+		out = append(out, insert.tokens...)
+		offset += insert.end - insert.start
+		if r != nil {
+			out = append(out, r)
+			offset += len(r.Value)
+		}
+		t = next()
+	}
+	if t != nil {
+		out = append(out, t)
+	}
+	// Remainder.
+	out = append(out, rootTokens[index:]...)
+	return Literator(out...), nil
+}
+
+func splitToken(t *Token, offset int) (l *Token, r *Token) {
+	if offset == 0 {
+		return nil, t
+	}
+	if offset >= len(t.Value) {
+		return t, nil
+	}
+	l = t.Clone()
+	r = t.Clone()
+	l.Value = l.Value[:offset]
+	r.Value = r.Value[offset:]
+	return
 }
diff --git a/delegate_test.go b/delegate_test.go
index 1ebf23b..ced29ef 100644
--- a/delegate_test.go
+++ b/delegate_test.go
@@ -6,11 +6,6 @@ import (
 	"github.com/alecthomas/assert"
 )

-var (
-	delegateSourceMiddle = `hello world <? what ?> there`
-	delegateSourceEnd    = `hello world <? what there`
-)
-
 func makeDelegationTestLexers() (lang Lexer, root Lexer) {
 	return MustNewLexer(nil, Rules{
 		"root": {
@@ -32,85 +27,84 @@ func makeDelegationTestLexers() (lang Lexer, root Lexer) {
 		})
 }

-func TestDelegateSplitOtherTokens(t *testing.T) {
-	lang, _ := makeDelegationTestLexers()
-	it, err := lang.Tokenise(nil, delegateSourceMiddle)
-	assert.NoError(t, err)
-	splits, other := splitOtherTokens(it)
-	assert.Equal(t, "hello world  there", other)
-	expected := []tokenSplit{tokenSplit{
-		pos: 12,
-		tokens: []*Token{
+func TestDelegate(t *testing.T) {
+	testdata := []struct {
+		name     string
+		source   string
+		expected []*Token
+	}{
+		{"SourceInMiddle", `hello world <? what ?> there`, []*Token{
+			{Keyword, "hello"},
+			{TextWhitespace, " "},
+			{Name, "world"},
+			{TextWhitespace, " "},
+			// lang
 			{CommentPreproc, "<?"},
 			{Whitespace, " "},
 			{Keyword, "what"},
 			{Whitespace, " "},
 			{CommentPreproc, "?>"},
-		},
-	}}
-	assert.Equal(t, expected, splits)
-}
-
-func TestDelegateSplitOtherTokensSourceAtEnd(t *testing.T) {
-	lang, _ := makeDelegationTestLexers()
-	lang = Coalesce(lang)
-	it, err := lang.Tokenise(nil, delegateSourceEnd)
-	assert.NoError(t, err)
-	splits, other := splitOtherTokens(it)
-	assert.Equal(t, "hello world ", other)
-	expected := []tokenSplit{tokenSplit{
-		pos: 12,
-		tokens: []*Token{
+			// /lang
+			{TextWhitespace, " "},
+			{Name, "there"},
+		}},
+		{"SourceBeginning", `<? what ?> hello world there`, []*Token{
+			{CommentPreproc, "<?"},
+			{TextWhitespace, " "},
+			{Keyword, "what"},
+			{TextWhitespace, " "},
+			{CommentPreproc, "?>"},
+			{TextWhitespace, " "},
+			{Keyword, "hello"},
+			{TextWhitespace, " "},
+			{Name, "world"},
+			{TextWhitespace, " "},
+			{Name, "there"},
+		}},
+		{"SourceEnd", `hello world <? what there`, []*Token{
+			{Keyword, "hello"},
+			{TextWhitespace, " "},
+			{Name, "world"},
+			{TextWhitespace, " "},
+			// lang
 			{CommentPreproc, "<?"},
 			{Whitespace, " "},
 			{Keyword, "what"},
 			{TextWhitespace, " "},
 			{Error, "there"},
-		},
-	}}
-	assert.Equal(t, expected, splits)
-}
-
-func TestDelegate(t *testing.T) {
+		}},
+		{"SourceMultiple", "hello world <? what ?> hello there <? what ?> hello", []*Token{
+			{Keyword, "hello"},
+			{TextWhitespace, " "},
+			{Name, "world"},
+			{TextWhitespace, " "},
+			{CommentPreproc, "<?"},
+			{TextWhitespace, " "},
+			{Keyword, "what"},
+			{TextWhitespace, " "},
+			{CommentPreproc, "?>"},
+			{TextWhitespace, " "},
+			{Keyword, "hello"},
+			{TextWhitespace, " "},
+			{Name, "there"},
+			{TextWhitespace, " "},
+			{CommentPreproc, "<?"},
+			{TextWhitespace, " "},
+			{Keyword, "what"},
+			{TextWhitespace, " "},
+			{CommentPreproc, "?>"},
+			{TextWhitespace, " "},
+			{Keyword, "hello"},
+		}},
+	}
 	lang, root := makeDelegationTestLexers()
 	delegate := DelegatingLexer(root, lang)
-	it, err := delegate.Tokenise(nil, delegateSourceMiddle)
-	assert.NoError(t, err)
-	expected := []*Token{
-		{Keyword, "hello"},
-		{TextWhitespace, " "},
-		{Name, "world"},
-		{TextWhitespace, " "},
-		// lang
-		{CommentPreproc, "<?"},
-		{Whitespace, " "},
-		{Keyword, "what"},
-		{Whitespace, " "},
-		{CommentPreproc, "?>"},
-		// /lang
-		{TextWhitespace, " "},
-		{Name, "there"},
+	for _, test := range testdata {
+		t.Run(test.name, func(t *testing.T) {
+			it, err := delegate.Tokenise(nil, test.source)
+			assert.NoError(t, err)
+			actual := it.Tokens()
+			assert.Equal(t, test.expected, actual)
+		})
 	}
-	assert.Equal(t, expected, it.Tokens())
-}
-
-func TestDelegateEnd(t *testing.T) {
-	lang, root := makeDelegationTestLexers()
-	lang = Coalesce(lang)
-	delegate := DelegatingLexer(root, lang)
-	it, err := delegate.Tokenise(nil, delegateSourceEnd)
-	assert.NoError(t, err)
-	expected := []*Token{
-		{Keyword, "hello"},
-		{TextWhitespace, " "},
-		{Name, "world"},
-		{TextWhitespace, " "},
-		// lang
-		{CommentPreproc, "<?"},
-		{Whitespace, " "},
-		{Keyword, "what"},
-		{TextWhitespace, " "},
-		{Error, "there"},
-	}
-	assert.Equal(t, expected, it.Tokens())
 }
diff --git a/lexers/circular/doc.go b/lexers/circular/doc.go
new file mode 100644
index 0000000..48d0fb7
--- /dev/null
+++ b/lexers/circular/doc.go
@@ -0,0 +1,2 @@
+// Package circular exists to break circular dependencies between lexers.
+package circular
diff --git a/lexers/p/php.go b/lexers/circular/php.go
similarity index 97%
rename from lexers/p/php.go
rename to lexers/circular/php.go
index 2ef4d4b..87127a8 100644
--- a/lexers/p/php.go
+++ b/lexers/circular/php.go
@@ -1,12 +1,13 @@
-package p
+package circular

 import (
 	. "github.com/alecthomas/chroma" // nolint
+	"github.com/alecthomas/chroma/lexers/h"
 	"github.com/alecthomas/chroma/lexers/internal"
 )

 // PHP lexer.
-var PHP = internal.Register(DelegatingLexer(HTML, MustNewLexer(
+var PHP = internal.Register(DelegatingLexer(h.HTML, MustNewLexer(
 	&Config{
 		Name:    "PHP",
 		Aliases: []string{"php", "php3", "php4", "php5"},
diff --git a/lexers/lexers.go b/lexers/lexers.go
index 4ddb118..2897299 100644
--- a/lexers/lexers.go
+++ b/lexers/lexers.go
@@ -9,6 +9,7 @@ import (
 	_ "github.com/alecthomas/chroma/lexers/a"
 	_ "github.com/alecthomas/chroma/lexers/b"
 	_ "github.com/alecthomas/chroma/lexers/c"
+	_ "github.com/alecthomas/chroma/lexers/circular"
 	_ "github.com/alecthomas/chroma/lexers/d"
 	_ "github.com/alecthomas/chroma/lexers/e"
 	_ "github.com/alecthomas/chroma/lexers/f"
diff --git a/lexers/s/smarty.go b/lexers/s/smarty.go
index a3592e2..c364ffa 100644
--- a/lexers/s/smarty.go
+++ b/lexers/s/smarty.go
@@ -1,9 +1,9 @@
 package s

 import (
-	. "github.com/alecthomas/chroma" // nolint
+	. "github.com/alecthomas/chroma"                 // nolint
+	. "github.com/alecthomas/chroma/lexers/circular" // nolint
 	"github.com/alecthomas/chroma/lexers/internal"
-	. "github.com/alecthomas/chroma/lexers/p" // nolint
 )

 // Smarty lexer.
diff --git a/lexers/testdata/php.actual b/lexers/testdata/php.actual
index e51730b..764de58 100644
--- a/lexers/testdata/php.actual
+++ b/lexers/testdata/php.actual
@@ -1,3 +1,8 @@
+
+
+
+
+
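
For reviewers who want to see the reworked delegation in action, here is a minimal sketch of how DelegatingLexer is exercised, modelled on makeDelegationTestLexers from delegate_test.go above. The rule sets and sample input are illustrative only (the exact regexes in the test file are outside the context shown in this diff); the API calls (MustNewLexer, DelegatingLexer, Tokenise, Iterator.Tokens) are the ones the new tests use.

```go
package main

import (
	"fmt"

	. "github.com/alecthomas/chroma"
)

func main() {
	// Language lexer: recognises "<? ... ?>" blocks and, as required by
	// DelegatingLexer, emits Other for everything it does not understand.
	// (Illustrative rules, loosely mirroring the delegate_test.go lexers.)
	lang := MustNewLexer(nil, Rules{
		"root": {
			{`\<\?`, CommentPreproc, Push("inside")},
			{`.`, Other, nil},
		},
		"inside": {
			{`\?\>`, CommentPreproc, Pop(1)},
			{`\bwhat\b`, Keyword, nil},
			{`\s+`, Whitespace, nil},
		},
	})
	// Root lexer: re-lexes whatever the language lexer marked as Other.
	root := MustNewLexer(nil, Rules{
		"root": {
			{`\bhello\b`, Keyword, nil},
			{`\b(world|there)\b`, Name, nil},
			{`\s+`, Whitespace, nil},
		},
	})

	delegate := DelegatingLexer(root, lang)
	it, err := delegate.Tokenise(nil, `hello world <? what ?> there`)
	if err != nil {
		panic(err)
	}
	for _, t := range it.Tokens() {
		fmt.Printf("%s %q\n", t.Type, t.Value)
	}
}
```

The important contract, per the new doc comment, is that the language lexer returns Other for unrecognised text; DelegatingLexer re-lexes exactly those regions with the root lexer and splices the two token streams back together at the recorded insertion points, splitting a root token with splitToken when an insertion falls inside it.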