
Implemented delegating lexer.

This is a lexer that is useful for templating languages, where the
surrounding text may be of a different syntax, e.g. PHP inside HTML.

The PHP lexer has been changed accordingly.

Fixes #80
Alec Thomas 2018-03-18 20:38:29 +11:00
parent 15a009f0fc
commit db6920e68f
8 changed files with 230 additions and 166 deletions
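As described in the commit message above, the delegating lexer targets templating languages. Below is a minimal sketch of how it is wired up; the toy "<? ... ?>" language, its rules, and the sample input are illustrative only, not taken from this commit. The language lexer must emit Other for everything it does not recognise, and the root lexer is then run over that leftover text.

package main

import (
    "fmt"

    "github.com/alecthomas/chroma"
)

func main() {
    // Toy embedded language: everything outside <? ... ?> is emitted as Other.
    lang := chroma.MustNewLexer(nil, chroma.Rules{
        "root": {
            {`<\?`, chroma.CommentPreproc, chroma.Push("inside")},
            {`.|\n`, chroma.Other, nil},
        },
        "inside": {
            {`\?>`, chroma.CommentPreproc, chroma.Pop(1)},
            {`\w+`, chroma.Keyword, nil},
            {`\s+`, chroma.Whitespace, nil},
        },
    })
    // Root lexer for the surrounding text.
    root := chroma.MustNewLexer(nil, chroma.Rules{
        "root": {
            {`\w+`, chroma.Name, nil},
            {`\s+`, chroma.TextWhitespace, nil},
        },
    })
    delegate := chroma.DelegatingLexer(root, lang)
    it, err := delegate.Tokenise(nil, `hello <? world ?> again`)
    if err != nil {
        panic(err)
    }
    // Root and language tokens come back interleaved in source order.
    for t := it(); t != nil; t = it() {
        fmt.Printf("%-16v %q\n", t.Type, t.Value)
    }
}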

delegate.go

@@ -9,10 +9,12 @@ type delegatingLexer struct {
     language Lexer
 }
 
-// DelegatingLexer takes two lexer as arguments. A root lexer and
-// a language lexer. First everything is scanned using the language
-// lexer, afterwards all Other tokens are lexed using the root
-// lexer.
+// DelegatingLexer combines two lexers to handle the common case of a language embedded inside another, such as PHP
+// inside HTML or PHP inside plain text.
+//
+// It takes two lexers as arguments: a root lexer and a language lexer. First everything is scanned using the language
+// lexer, which must return "Other" for unrecognised tokens. Then all "Other" tokens are lexed using the root lexer.
+// Finally, these two sets of tokens are merged.
 //
 // The lexers from the template lexer package use this base lexer.
 func DelegatingLexer(root Lexer, language Lexer) Lexer {
@@ -26,101 +28,108 @@ func (d *delegatingLexer) Config() *Config {
     return d.language.Config()
 }
 
-type tokenSplit struct {
-    pos    int
-    tokens []*Token
-}
-
-func splitOtherTokens(it Iterator) ([]tokenSplit, string) {
-    splits := []tokenSplit{}
-    var split *tokenSplit
-    other := bytes.Buffer{}
-    offset := 0
-    for t := it(); t != nil; t = it() {
-        if t.Type == Other {
-            if split != nil {
-                splits = append(splits, *split)
-                split = nil
-            }
-            other.WriteString(t.Value)
-        } else {
-            if split == nil {
-                split = &tokenSplit{pos: offset}
-            }
-            split.tokens = append(split.tokens, t)
-        }
-        offset += len(t.Value)
-    }
-    if split != nil {
-        splits = append(splits, *split)
-    }
-    return splits, other.String()
+// An insertion is the character range where language tokens should be inserted.
+type insertion struct {
+    start, end int
+    tokens     []*Token
 }
 
 func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
-    it, err := d.language.Tokenise(options, text)
+    tokens, err := Tokenise(Coalesce(d.language), options, text)
     if err != nil {
         return nil, err
     }
-    splits, other := splitOtherTokens(it)
-    it, err = d.root.Tokenise(options, other)
-    if err != nil {
-        return nil, err
-    }
+    // Compute insertions and gather "Other" tokens.
+    others := &bytes.Buffer{}
+    insertions := []*insertion{}
+    var insert *insertion
     offset := 0
-    return func() *Token {
-        // First, see if there's a split at the start of this token.
-        for len(splits) > 0 && splits[0].pos == offset {
-            if len(splits[0].tokens) > 0 {
-                t := splits[0].tokens[0]
-                splits[0].tokens = splits[0].tokens[1:]
-                offset += len(t.Value)
-                return t
-            }
-            // End of tokens from this split, shift it off the queue.
-            splits = splits[1:]
-        }
-
-        // No split, try to consume a token.
-        t := it()
-        if t == nil {
-            for len(splits) > 0 {
-                if len(splits[0].tokens) > 0 {
-                    t = splits[0].tokens[0]
-                    splits[0].tokens = splits[0].tokens[1:]
-                    offset += len(t.Value)
-                    return t
-                }
-                // End of tokens from this split, shift it off the queue.
-                splits = splits[1:]
-            }
-            return nil
-        }
-
-        // Check if there's a split in the middle of the current token.
-        if len(splits) > 0 && splits[0].pos < offset+len(t.Value) {
-            // Split the token.
-            next := t.Clone()
-            point := splits[0].pos - offset
-            next.Value = next.Value[point:]
-            t.Value = t.Value[:point]
-
-            // Insert the tail of the split token after any other splits at the same point.
-            tailPos := offset + len(t.Value)
-            tail := []tokenSplit{{pos: tailPos, tokens: []*Token{next}}}
-            i := 0
-            for ; i < len(splits); i++ {
-                if splits[i].pos > tailPos {
-                    break
-                }
-            }
-            splits = append(splits[:i], append(tail, splits[i:]...)...)
-            // Finally, return the head.
-        }
-        offset += len(t.Value)
-        return t
-    }, nil
+    var last *Token
+    for _, t := range tokens {
+        if t.Type == Other {
+            if last != nil && insert != nil && last.Type != Other {
+                insert.end = offset
+            }
+            others.WriteString(t.Value)
+        } else {
+            if last == nil || last.Type == Other {
+                insert = &insertion{start: offset}
+                insertions = append(insertions, insert)
+            }
+            insert.tokens = append(insert.tokens, t)
+        }
+        last = t
+        offset += len(t.Value)
+    }
+
+    if len(insertions) == 0 {
+        return d.root.Tokenise(options, text)
+    }
+
+    // Lex the other tokens.
+    rootTokens, err := Tokenise(d.root, options, others.String())
+    if err != nil {
+        return nil, err
+    }
+
+    // Interleave the two sets of tokens.
+    out := []*Token{}
+    offset = 0
+    index := 0
+    next := func() *Token {
+        if index >= len(rootTokens) {
+            return nil
+        }
+        t := rootTokens[index]
+        index++
+        return t
+    }
+    t := next()
+    for _, insert := range insertions {
+        // Consume tokens until insertion point.
+        for t != nil && offset+len(t.Value) <= insert.start {
+            out = append(out, t)
+            offset += len(t.Value)
+            t = next()
+        }
+        // End of root tokens, append insertion point.
+        if t == nil {
+            out = append(out, insert.tokens...)
+            break
+        }
+        // Split and insert.
+        l, r := splitToken(t, insert.start-offset)
+        if l != nil {
+            out = append(out, l)
+            offset += len(l.Value)
+        }
+        out = append(out, insert.tokens...)
+        offset += insert.end - insert.start
+        if r != nil {
+            out = append(out, r)
+            offset += len(r.Value)
+        }
+        t = next()
+    }
+    if t != nil {
+        out = append(out, t)
+    }
+    // Remainder.
+    out = append(out, rootTokens[index:]...)
+    return Literator(out...), nil
+}
+
+func splitToken(t *Token, offset int) (l *Token, r *Token) {
+    if offset == 0 {
+        return nil, t
+    }
+    if offset >= len(t.Value) {
+        return t, nil
+    }
+    l = t.Clone()
+    r = t.Clone()
+    l.Value = l.Value[:offset]
+    r.Value = r.Value[offset:]
+    return
 }
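The new splitToken helper above carries the core of the merge step: it cuts a root token at the byte offset where language tokens have to be spliced in. A small hedged sketch of its contract, written as a package-internal test that is not part of this commit:

package chroma

import (
    "testing"

    "github.com/alecthomas/assert"
)

// Sketch only: splitToken cuts a token at a byte offset so language tokens
// can be spliced in mid-token; offsets of 0 or past the end leave one side nil.
func TestSplitTokenSketch(t *testing.T) {
    tok := &Token{Type: Text, Value: "hello there"}
    l, r := splitToken(tok, 6)
    assert.Equal(t, "hello ", l.Value)
    assert.Equal(t, "there", r.Value)
}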

delegate_test.go

@@ -6,11 +6,6 @@ import (
     "github.com/alecthomas/assert"
 )
 
-var (
-    delegateSourceMiddle = `hello world <? what ?> there`
-    delegateSourceEnd    = `hello world <? what there`
-)
-
 func makeDelegationTestLexers() (lang Lexer, root Lexer) {
     return MustNewLexer(nil, Rules{
         "root": {
@@ -32,85 +27,84 @@ func makeDelegationTestLexers() (lang Lexer, root Lexer) {
     })
 }
 
-func TestDelegateSplitOtherTokens(t *testing.T) {
-    lang, _ := makeDelegationTestLexers()
-    it, err := lang.Tokenise(nil, delegateSourceMiddle)
-    assert.NoError(t, err)
-    splits, other := splitOtherTokens(it)
-    assert.Equal(t, "hello world  there", other)
-    expected := []tokenSplit{tokenSplit{
-        pos: 12,
-        tokens: []*Token{
+func TestDelegate(t *testing.T) {
+    testdata := []struct {
+        name     string
+        source   string
+        expected []*Token
+    }{
+        {"SourceInMiddle", `hello world <? what ?> there`, []*Token{
+            {Keyword, "hello"},
+            {TextWhitespace, " "},
+            {Name, "world"},
+            {TextWhitespace, " "},
+            // lang
             {CommentPreproc, "<?"},
             {Whitespace, " "},
             {Keyword, "what"},
             {Whitespace, " "},
             {CommentPreproc, "?>"},
-        },
-    }}
-    assert.Equal(t, expected, splits)
-}
-
-func TestDelegateSplitOtherTokensSourceAtEnd(t *testing.T) {
-    lang, _ := makeDelegationTestLexers()
-    lang = Coalesce(lang)
-    it, err := lang.Tokenise(nil, delegateSourceEnd)
-    assert.NoError(t, err)
-    splits, other := splitOtherTokens(it)
-    assert.Equal(t, "hello world ", other)
-    expected := []tokenSplit{tokenSplit{
-        pos: 12,
-        tokens: []*Token{
+            // /lang
+            {TextWhitespace, " "},
+            {Name, "there"},
+        }},
+        {"SourceBeginning", `<? what ?> hello world there`, []*Token{
+            {CommentPreproc, "<?"},
+            {TextWhitespace, " "},
+            {Keyword, "what"},
+            {TextWhitespace, " "},
+            {CommentPreproc, "?>"},
+            {TextWhitespace, " "},
+            {Keyword, "hello"},
+            {TextWhitespace, " "},
+            {Name, "world"},
+            {TextWhitespace, " "},
+            {Name, "there"},
+        }},
+        {"SourceEnd", `hello world <? what there`, []*Token{
+            {Keyword, "hello"},
+            {TextWhitespace, " "},
+            {Name, "world"},
+            {TextWhitespace, " "},
+            // lang
             {CommentPreproc, "<?"},
             {Whitespace, " "},
             {Keyword, "what"},
             {TextWhitespace, " "},
             {Error, "there"},
-        },
-    }}
-    assert.Equal(t, expected, splits)
-}
-
-func TestDelegate(t *testing.T) {
+        }},
+        {"SourceMultiple", "hello world <? what ?> hello there <? what ?> hello", []*Token{
+            {Keyword, "hello"},
+            {TextWhitespace, " "},
+            {Name, "world"},
+            {TextWhitespace, " "},
+            {CommentPreproc, "<?"},
+            {TextWhitespace, " "},
+            {Keyword, "what"},
+            {TextWhitespace, " "},
+            {CommentPreproc, "?>"},
+            {TextWhitespace, " "},
+            {Keyword, "hello"},
+            {TextWhitespace, " "},
+            {Name, "there"},
+            {TextWhitespace, " "},
+            {CommentPreproc, "<?"},
+            {TextWhitespace, " "},
+            {Keyword, "what"},
+            {TextWhitespace, " "},
+            {CommentPreproc, "?>"},
+            {TextWhitespace, " "},
+            {Keyword, "hello"},
+        }},
+    }
     lang, root := makeDelegationTestLexers()
     delegate := DelegatingLexer(root, lang)
-    it, err := delegate.Tokenise(nil, delegateSourceMiddle)
-    assert.NoError(t, err)
-    expected := []*Token{
-        {Keyword, "hello"},
-        {TextWhitespace, " "},
-        {Name, "world"},
-        {TextWhitespace, " "},
-        // lang
-        {CommentPreproc, "<?"},
-        {Whitespace, " "},
-        {Keyword, "what"},
-        {Whitespace, " "},
-        {CommentPreproc, "?>"},
-        // /lang
-        {TextWhitespace, " "},
-        {Name, "there"},
-    }
-    assert.Equal(t, expected, it.Tokens())
-}
-
-func TestDelegateEnd(t *testing.T) {
-    lang, root := makeDelegationTestLexers()
-    lang = Coalesce(lang)
-    delegate := DelegatingLexer(root, lang)
-    it, err := delegate.Tokenise(nil, delegateSourceEnd)
-    assert.NoError(t, err)
-    expected := []*Token{
-        {Keyword, "hello"},
-        {TextWhitespace, " "},
-        {Name, "world"},
-        {TextWhitespace, " "},
-        // lang
-        {CommentPreproc, "<?"},
-        {Whitespace, " "},
-        {Keyword, "what"},
-        {TextWhitespace, " "},
-        {Error, "there"},
-    }
-    assert.Equal(t, expected, it.Tokens())
+    for _, test := range testdata {
+        t.Run(test.name, func(t *testing.T) {
+            it, err := delegate.Tokenise(nil, test.source)
+            assert.NoError(t, err)
+            actual := it.Tokens()
+            assert.Equal(t, test.expected, actual)
+        })
+    }
 }

lexers/circular/doc.go (new file)

@@ -0,0 +1,2 @@
+// Package circular exists to break circular dependencies between lexers.
+package circular

lexers/p/php.go → lexers/circular/php.go

@@ -1,12 +1,13 @@
-package p
+package circular
 
 import (
     . "github.com/alecthomas/chroma" // nolint
+    "github.com/alecthomas/chroma/lexers/h"
     "github.com/alecthomas/chroma/lexers/internal"
 )
 
 // PHP lexer.
-var PHP = internal.Register(DelegatingLexer(HTML, MustNewLexer(
+var PHP = internal.Register(DelegatingLexer(h.HTML, MustNewLexer(
     &Config{
         Name:    "PHP",
         Aliases: []string{"php", "php3", "php4", "php5"},
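Since PHP now delegates to the HTML lexer, a mixed PHP/HTML source also gets its markup highlighted. A rough usage sketch, assuming the chroma/quick helper; the file name, formatter, and style below are arbitrary:

package main

import (
    "os"

    "github.com/alecthomas/chroma/quick"
)

func main() {
    source, err := os.ReadFile("page.php") // hypothetical mixed PHP/HTML file
    if err != nil {
        panic(err)
    }
    // "php" now resolves to the delegating lexer registered above, so the
    // HTML around the <?php ... ?> block is highlighted too.
    if err := quick.Highlight(os.Stdout, string(source), "php", "terminal256", "monokai"); err != nil {
        panic(err)
    }
}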

lexers/lexers.go

@@ -9,6 +9,7 @@ import (
     _ "github.com/alecthomas/chroma/lexers/a"
     _ "github.com/alecthomas/chroma/lexers/b"
     _ "github.com/alecthomas/chroma/lexers/c"
+    _ "github.com/alecthomas/chroma/lexers/circular"
     _ "github.com/alecthomas/chroma/lexers/d"
     _ "github.com/alecthomas/chroma/lexers/e"
     _ "github.com/alecthomas/chroma/lexers/f"

lexers/s/smarty.go

@@ -1,9 +1,9 @@
 package s
 
 import (
     . "github.com/alecthomas/chroma" // nolint
+    . "github.com/alecthomas/chroma/lexers/circular" // nolint
     "github.com/alecthomas/chroma/lexers/internal"
-    . "github.com/alecthomas/chroma/lexers/p" // nolint
 )
 
 // Smarty lexer.

lexers/testdata/php.actual

@@ -1,3 +1,8 @@
+<!DOCTYPE html>
+<html>
+<body>
+
+<h1>My first PHP page</h1>
 <?php
 
 $docs = $modx->getIterator('modResource', ["parent" => 84]);
@@ -8,4 +13,7 @@ foreach($docs as $doc){
 print_r($doc->content);
 // $doc->save();
 } 
 // some comment
+?>
+</body>
+</html>

lexers/testdata/php.expected

@@ -1,4 +1,35 @@
 [
+{"type":"CommentPreproc","value":"\u003c!DOCTYPE html\u003e"},
+{"type":"Text","value":"\n"},
+{"type":"Punctuation","value":"\u003c"},
+{"type":"Text","value":""},
+{"type":"NameTag","value":"html"},
+{"type":"Punctuation","value":""},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"\u003e"},
+{"type":"Text","value":"\n"},
+{"type":"Punctuation","value":"\u003c"},
+{"type":"Text","value":""},
+{"type":"NameTag","value":"body"},
+{"type":"Punctuation","value":""},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"\u003e"},
+{"type":"Text","value":"\n\n"},
+{"type":"Punctuation","value":"\u003c"},
+{"type":"Text","value":""},
+{"type":"NameTag","value":"h1"},
+{"type":"Punctuation","value":""},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"\u003e"},
+{"type":"Text","value":"My first PHP page"},
+{"type":"Punctuation","value":"\u003c"},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"/"},
+{"type":"Text","value":""},
+{"type":"NameTag","value":"h1"},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"\u003e"},
+{"type":"Text","value":"\n"},
 {"type":"CommentPreproc","value":"\u003c?php"},
 {"type":"Text","value":"\n\n"},
 {"type":"NameVariable","value":"$docs"},
@@ -68,5 +99,23 @@
 {"type":"CommentSingle","value":"// $doc-\u003esave();\n"},
 {"type":"Punctuation","value":"}"},
 {"type":"Text","value":" \n"},
-{"type":"CommentSingle","value":"// some comment\n"}
+{"type":"CommentSingle","value":"// some comment\n"},
+{"type":"CommentPreproc","value":"?\u003e"},
+{"type":"Text","value":"\n"},
+{"type":"Punctuation","value":"\u003c"},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"/"},
+{"type":"Text","value":""},
+{"type":"NameTag","value":"body"},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"\u003e"},
+{"type":"Text","value":"\n"},
+{"type":"Punctuation","value":"\u003c"},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"/"},
+{"type":"Text","value":""},
+{"type":"NameTag","value":"html"},
+{"type":"Text","value":""},
+{"type":"Punctuation","value":"\u003e"},
+{"type":"Text","value":"\n"}
 ]