Mirror of https://github.com/alecthomas/chroma.git, synced 2025-03-21 21:17:50 +02:00
Implemented delegating lexer.
This is a lexer that is useful for templating languages, where the surrounding text may be of a different syntax, e.g. PHP+HTML. The PHP lexer has been changed accordingly. Fixes #80.
This commit is contained in:
parent 15a009f0fc
commit db6920e68f
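
For orientation, a minimal usage sketch of the new lexer (not part of the commit; it assumes the chroma API as of this change, where an Iterator is a func() *Token that returns nil at the end of the stream, and where the registered "php" lexer is now a DelegatingLexer over HTML):

package main

import (
    "fmt"

    "github.com/alecthomas/chroma/lexers"
)

func main() {
    // The "php" lexer registered below in lexers/circular delegates everything
    // it marks as Other (the surrounding markup) to the HTML lexer.
    lexer := lexers.Get("php")
    it, err := lexer.Tokenise(nil, `<b><?php echo "hi"; ?></b>`)
    if err != nil {
        panic(err)
    }
    for t := it(); t != nil; t = it() {
        fmt.Printf("%-20s %q\n", t.Type, t.Value)
    }
}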
delegate.go | 183
@@ -9,10 +9,12 @@ type delegatingLexer struct {
    language Lexer
}

// DelegatingLexer takes two lexer as arguments. A root lexer and
// a language lexer. First everything is scanned using the language
// lexer, afterwards all Other tokens are lexed using the root
// lexer.
// DelegatingLexer combines two lexers to handle the common case of a language embedded inside another, such as PHP
// inside HTML or PHP inside plain text.
//
// It takes two lexer as arguments: a root lexer and a language lexer. First everything is scanned using the language
// lexer, which must return "Other" for unrecognised tokens. Then all "Other" tokens are lexed using the root lexer.
// Finally, these two sets of tokens are merged.
//
// The lexers from the template lexer package use this base lexer.
func DelegatingLexer(root Lexer, language Lexer) Lexer {
@@ -26,101 +28,108 @@ func (d *delegatingLexer) Config() *Config {
    return d.language.Config()
}

type tokenSplit struct {
    pos    int
    tokens []*Token
}

func splitOtherTokens(it Iterator) ([]tokenSplit, string) {
    splits := []tokenSplit{}
    var split *tokenSplit
    other := bytes.Buffer{}
    offset := 0
    for t := it(); t != nil; t = it() {
        if t.Type == Other {
            if split != nil {
                splits = append(splits, *split)
                split = nil
            }
            other.WriteString(t.Value)
        } else {
            if split == nil {
                split = &tokenSplit{pos: offset}
            }
            split.tokens = append(split.tokens, t)
        }
        offset += len(t.Value)
    }
    if split != nil {
        splits = append(splits, *split)
    }
    return splits, other.String()
// An insertion is the character range where language tokens should be inserted.
type insertion struct {
    start, end int
    tokens     []*Token
}

func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
    it, err := d.language.Tokenise(options, text)
    tokens, err := Tokenise(Coalesce(d.language), options, text)
    if err != nil {
        return nil, err
    }
    splits, other := splitOtherTokens(it)
    it, err = d.root.Tokenise(options, other)
    if err != nil {
        return nil, err
    }

    // Compute insertions and gather "Other" tokens.
    others := &bytes.Buffer{}
    insertions := []*insertion{}
    var insert *insertion
    offset := 0
    return func() *Token {
        // First, see if there's a split at the start of this token.
        for len(splits) > 0 && splits[0].pos == offset {
            if len(splits[0].tokens) > 0 {
                t := splits[0].tokens[0]
                splits[0].tokens = splits[0].tokens[1:]
                offset += len(t.Value)
                return t
    var last *Token
    for _, t := range tokens {
        if t.Type == Other {
            if last != nil && insert != nil && last.Type != Other {
                insert.end = offset
            }
            // End of tokens from this split, shift it off the queue.
            splits = splits[1:]
            others.WriteString(t.Value)
        } else {
            if last == nil || last.Type == Other {
                insert = &insertion{start: offset}
                insertions = append(insertions, insert)
            }
            insert.tokens = append(insert.tokens, t)
        }
        last = t
        offset += len(t.Value)
    }

        // No split, try to consume a token.
        t := it()
        if t == nil {
            for len(splits) > 0 {
                if len(splits[0].tokens) > 0 {
                    t = splits[0].tokens[0]
                    splits[0].tokens = splits[0].tokens[1:]
                    offset += len(t.Value)
                    return t
                }
                // End of tokens from this split, shift it off the queue.
                splits = splits[1:]
            }
    if len(insertions) == 0 {
        return d.root.Tokenise(options, text)
    }

    // Lex the other tokens.
    rootTokens, err := Tokenise(d.root, options, others.String())
    if err != nil {
        return nil, err
    }

    // Interleave the two sets of tokens.
    out := []*Token{}
    offset = 0
    index := 0
    next := func() *Token {
        if index >= len(rootTokens) {
            return nil
        }

        // Check if there's a split in the middle of the current token.
        if len(splits) > 0 && splits[0].pos < offset+len(t.Value) {
            // Split the token.
            next := t.Clone()
            point := splits[0].pos - offset
            next.Value = next.Value[point:]
            t.Value = t.Value[:point]

            // Insert the tail of the split token after any other splits at the same point.
            tailPos := offset + len(t.Value)
            tail := []tokenSplit{{pos: tailPos, tokens: []*Token{next}}}
            i := 0
            for ; i < len(splits); i++ {
                if splits[i].pos > tailPos {
                    break
                }
            }
            splits = append(splits[:i], append(tail, splits[i:]...)...)

            // Finally, return the head.
        t := rootTokens[index]
        index++
        return t
    }
    t := next()
    for _, insert := range insertions {
        // Consume tokens until insertion point.
        for t != nil && offset+len(t.Value) <= insert.start {
            out = append(out, t)
            offset += len(t.Value)
            t = next()
        }
        // End of root tokens, append insertion point.
        if t == nil {
            out = append(out, insert.tokens...)
            break
        }

        offset += len(t.Value)
        return t
    }, nil
        // Split and insert.
        l, r := splitToken(t, insert.start-offset)
        if l != nil {
            out = append(out, l)
            offset += len(l.Value)
        }
        out = append(out, insert.tokens...)
        offset += insert.end - insert.start
        if r != nil {
            out = append(out, r)
            offset += len(r.Value)
        }
        t = next()
    }
    if t != nil {
        out = append(out, t)
    }
    // Remainder.
    out = append(out, rootTokens[index:]...)
    return Literator(out...), nil
}

func splitToken(t *Token, offset int) (l *Token, r *Token) {
    if offset == 0 {
        return nil, t
    }
    if offset >= len(t.Value) {
        return t, nil
    }
    l = t.Clone()
    r = t.Clone()
    l.Value = l.Value[:offset]
    r.Value = r.Value[offset:]
    return
}
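
The new Tokenise above records, while scanning with the language lexer, where runs of non-Other tokens sit, concatenates the Other text, lexes that with the root lexer, and then splices the language tokens back in, splitting any root token that straddles an insertion point. A rough standalone sketch of that bookkeeping, with toy types rather than chroma's (illustrative only):

package main

import "fmt"

type tok struct {
    typ string // "Other" marks text to hand over to the root lexer
    val string
}

// merge splices language tokens into the root lexer's output, in the spirit
// of delegatingLexer.Tokenise above.
func merge(lang []tok, lexRoot func(string) []tok) []tok {
    // 1. Gather the Other text and record where the language tokens belong.
    type insertion struct {
        start  int
        tokens []tok
    }
    other := ""
    ins := []insertion{}
    var cur *insertion
    for _, t := range lang {
        if t.typ == "Other" {
            other += t.val
            cur = nil
            continue
        }
        if cur == nil {
            ins = append(ins, insertion{start: len(other)})
            cur = &ins[len(ins)-1]
        }
        cur.tokens = append(cur.tokens, t)
    }
    // 2. Lex the concatenated Other text with the root lexer.
    root := lexRoot(other)
    // 3. Splice the language tokens back in at the recorded offsets,
    //    splitting any root token that straddles an insertion point.
    out := []tok{}
    offset, i := 0, 0
    for _, in := range ins {
        for i < len(root) {
            t := root[i]
            if offset+len(t.val) <= in.start {
                out = append(out, t)
                offset += len(t.val)
                i++
                continue
            }
            if cut := in.start - offset; cut > 0 {
                out = append(out, tok{t.typ, t.val[:cut]})
                root[i].val = t.val[cut:]
                offset += cut
            }
            break
        }
        out = append(out, in.tokens...)
    }
    return append(out, root[i:]...)
}

func main() {
    lang := []tok{{"Other", "hello "}, {"Preproc", "<?"}, {"Keyword", "what"}, {"Preproc", "?>"}, {"Other", " there"}}
    lexRoot := func(s string) []tok { return []tok{{"Text", s}} } // trivial root lexer
    fmt.Println(merge(lang, lexRoot))
}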
delegate_test.go | 140
@@ -6,11 +6,6 @@ import (
    "github.com/alecthomas/assert"
)

var (
    delegateSourceMiddle = `hello world <? what ?> there`
    delegateSourceEnd    = `hello world <? what there`
)

func makeDelegationTestLexers() (lang Lexer, root Lexer) {
    return MustNewLexer(nil, Rules{
        "root": {
@@ -32,85 +27,84 @@ func makeDelegationTestLexers() (lang Lexer, root Lexer) {
    })
}

func TestDelegateSplitOtherTokens(t *testing.T) {
    lang, _ := makeDelegationTestLexers()
    it, err := lang.Tokenise(nil, delegateSourceMiddle)
    assert.NoError(t, err)
    splits, other := splitOtherTokens(it)
    assert.Equal(t, "hello world there", other)
    expected := []tokenSplit{tokenSplit{
        pos: 12,
        tokens: []*Token{
func TestDelegate(t *testing.T) {
    testdata := []struct {
        name     string
        source   string
        expected []*Token
    }{
        {"SourceInMiddle", `hello world <? what ?> there`, []*Token{
            {Keyword, "hello"},
            {TextWhitespace, " "},
            {Name, "world"},
            {TextWhitespace, " "},
            // lang
            {CommentPreproc, "<?"},
            {Whitespace, " "},
            {Keyword, "what"},
            {Whitespace, " "},
            {CommentPreproc, "?>"},
        },
    }}
    assert.Equal(t, expected, splits)
}

func TestDelegateSplitOtherTokensSourceAtEnd(t *testing.T) {
    lang, _ := makeDelegationTestLexers()
    lang = Coalesce(lang)
    it, err := lang.Tokenise(nil, delegateSourceEnd)
    assert.NoError(t, err)
    splits, other := splitOtherTokens(it)
    assert.Equal(t, "hello world ", other)
    expected := []tokenSplit{tokenSplit{
        pos: 12,
        tokens: []*Token{
            // /lang
            {TextWhitespace, " "},
            {Name, "there"},
        }},
        {"SourceBeginning", `<? what ?> hello world there`, []*Token{
            {CommentPreproc, "<?"},
            {TextWhitespace, " "},
            {Keyword, "what"},
            {TextWhitespace, " "},
            {CommentPreproc, "?>"},
            {TextWhitespace, " "},
            {Keyword, "hello"},
            {TextWhitespace, " "},
            {Name, "world"},
            {TextWhitespace, " "},
            {Name, "there"},
        }},
        {"SourceEnd", `hello world <? what there`, []*Token{
            {Keyword, "hello"},
            {TextWhitespace, " "},
            {Name, "world"},
            {TextWhitespace, " "},
            // lang
            {CommentPreproc, "<?"},
            {Whitespace, " "},
            {Keyword, "what"},
            {TextWhitespace, " "},
            {Error, "there"},
        },
    }}
    assert.Equal(t, expected, splits)
}

func TestDelegate(t *testing.T) {
        }},
        {"SourceMultiple", "hello world <? what ?> hello there <? what ?> hello", []*Token{
            {Keyword, "hello"},
            {TextWhitespace, " "},
            {Name, "world"},
            {TextWhitespace, " "},
            {CommentPreproc, "<?"},
            {TextWhitespace, " "},
            {Keyword, "what"},
            {TextWhitespace, " "},
            {CommentPreproc, "?>"},
            {TextWhitespace, " "},
            {Keyword, "hello"},
            {TextWhitespace, " "},
            {Name, "there"},
            {TextWhitespace, " "},
            {CommentPreproc, "<?"},
            {TextWhitespace, " "},
            {Keyword, "what"},
            {TextWhitespace, " "},
            {CommentPreproc, "?>"},
            {TextWhitespace, " "},
            {Keyword, "hello"},
        }},
    }
    lang, root := makeDelegationTestLexers()
    delegate := DelegatingLexer(root, lang)
    it, err := delegate.Tokenise(nil, delegateSourceMiddle)
    assert.NoError(t, err)
    expected := []*Token{
        {Keyword, "hello"},
        {TextWhitespace, " "},
        {Name, "world"},
        {TextWhitespace, " "},
        // lang
        {CommentPreproc, "<?"},
        {Whitespace, " "},
        {Keyword, "what"},
        {Whitespace, " "},
        {CommentPreproc, "?>"},
        // /lang
        {TextWhitespace, " "},
        {Name, "there"},
    for _, test := range testdata {
        t.Run(test.name, func(t *testing.T) {
            it, err := delegate.Tokenise(nil, test.source)
            assert.NoError(t, err)
            actual := it.Tokens()
            assert.Equal(t, test.expected, actual)
        })
    }
    assert.Equal(t, expected, it.Tokens())
}

func TestDelegateEnd(t *testing.T) {
    lang, root := makeDelegationTestLexers()
    lang = Coalesce(lang)
    delegate := DelegatingLexer(root, lang)
    it, err := delegate.Tokenise(nil, delegateSourceEnd)
    assert.NoError(t, err)
    expected := []*Token{
        {Keyword, "hello"},
        {TextWhitespace, " "},
        {Name, "world"},
        {TextWhitespace, " "},
        // lang
        {CommentPreproc, "<?"},
        {Whitespace, " "},
        {Keyword, "what"},
        {TextWhitespace, " "},
        {Error, "there"},
    }
    assert.Equal(t, expected, it.Tokens())
}
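
The hunk headers above elide the body of makeDelegationTestLexers. For orientation, a sketch of what such a pair of lexers could look like, assuming chroma's Rules API at the time of this commit; the actual patterns and states in the test file may differ:

func makeSketchLexers() (lang Lexer, root Lexer) {
    // Language lexer: recognise <? ... ?> regions and emit Other for
    // everything else, so the root lexer can take over outside them.
    lang = MustNewLexer(nil, Rules{
        "root": {
            {`<\?`, CommentPreproc, Push("inside")},
            {`.|\n`, Other, nil},
        },
        "inside": {
            {`\?>`, CommentPreproc, Pop(1)},
            {`\bwhat\b`, Keyword, nil},
            {`\s+`, Whitespace, nil},
            {`.`, Error, nil},
        },
    })
    // Root lexer: plain words and whitespace.
    root = MustNewLexer(nil, Rules{
        "root": {
            {`\bhello\b`, Keyword, nil},
            {`\w+`, Name, nil},
            {`\s+`, TextWhitespace, nil},
        },
    })
    return
}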
lexers/circular/doc.go (new file) | 2
@@ -0,0 +1,2 @@
// Package circular exists to break circular dependencies between lexers.
package circular
@@ -1,12 +1,13 @@
package p
package circular

import (
    . "github.com/alecthomas/chroma" // nolint
    "github.com/alecthomas/chroma/lexers/h"
    "github.com/alecthomas/chroma/lexers/internal"
)

// PHP lexer.
var PHP = internal.Register(DelegatingLexer(HTML, MustNewLexer(
var PHP = internal.Register(DelegatingLexer(h.HTML, MustNewLexer(
    &Config{
        Name:    "PHP",
        Aliases: []string{"php", "php3", "php4", "php5"},
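
With PHP registered as a delegating lexer over HTML, embedded PHP can be highlighted through chroma's normal entry points. A brief sketch, assuming the top-level quick helper package; the source string is illustrative:

package main

import (
    "log"
    "os"

    "github.com/alecthomas/chroma/quick"
)

func main() {
    source := `<h1><?php echo "My first PHP page"; ?></h1>`
    // The "php" lexer delegates anything it marks as Other to the HTML lexer,
    // so both the markup and the PHP code get highlighted.
    if err := quick.Highlight(os.Stdout, source, "php", "terminal", "monokai"); err != nil {
        log.Fatal(err)
    }
}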
@@ -9,6 +9,7 @@ import (
    _ "github.com/alecthomas/chroma/lexers/a"
    _ "github.com/alecthomas/chroma/lexers/b"
    _ "github.com/alecthomas/chroma/lexers/c"
    _ "github.com/alecthomas/chroma/lexers/circular"
    _ "github.com/alecthomas/chroma/lexers/d"
    _ "github.com/alecthomas/chroma/lexers/e"
    _ "github.com/alecthomas/chroma/lexers/f"
@@ -1,9 +1,9 @@
package s

import (
    . "github.com/alecthomas/chroma" // nolint
    . "github.com/alecthomas/chroma" // nolint
    . "github.com/alecthomas/chroma/lexers/circular" // nolint
    "github.com/alecthomas/chroma/lexers/internal"
    . "github.com/alecthomas/chroma/lexers/p" // nolint
)

// Smarty lexer.
lexers/testdata/php.actual (vendored) | 10
@@ -1,3 +1,8 @@
<!DOCTYPE html>
<html>
<body>

<h1>My first PHP page</h1>
<?php

$docs = $modx->getIterator('modResource', ["parent" => 84]);
@@ -8,4 +13,7 @@ foreach($docs as $doc){
    print_r($doc->content);
    // $doc->save();
}
// some comment
// some comment
?>
</body>
</html>
lexers/testdata/php.expected (vendored) | 51
@@ -1,4 +1,35 @@
[
  {"type":"CommentPreproc","value":"\u003c!DOCTYPE html\u003e"},
  {"type":"Text","value":"\n"},
  {"type":"Punctuation","value":"\u003c"},
  {"type":"Text","value":""},
  {"type":"NameTag","value":"html"},
  {"type":"Punctuation","value":""},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"\u003e"},
  {"type":"Text","value":"\n"},
  {"type":"Punctuation","value":"\u003c"},
  {"type":"Text","value":""},
  {"type":"NameTag","value":"body"},
  {"type":"Punctuation","value":""},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"\u003e"},
  {"type":"Text","value":"\n\n"},
  {"type":"Punctuation","value":"\u003c"},
  {"type":"Text","value":""},
  {"type":"NameTag","value":"h1"},
  {"type":"Punctuation","value":""},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"\u003e"},
  {"type":"Text","value":"My first PHP page"},
  {"type":"Punctuation","value":"\u003c"},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"/"},
  {"type":"Text","value":""},
  {"type":"NameTag","value":"h1"},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"\u003e"},
  {"type":"Text","value":"\n"},
  {"type":"CommentPreproc","value":"\u003c?php"},
  {"type":"Text","value":"\n\n"},
  {"type":"NameVariable","value":"$docs"},
@@ -68,5 +99,23 @@
  {"type":"CommentSingle","value":"// $doc-\u003esave();\n"},
  {"type":"Punctuation","value":"}"},
  {"type":"Text","value":" \n"},
  {"type":"CommentSingle","value":"// some comment\n"}
  {"type":"CommentSingle","value":"// some comment\n"},
  {"type":"CommentPreproc","value":"?\u003e"},
  {"type":"Text","value":"\n"},
  {"type":"Punctuation","value":"\u003c"},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"/"},
  {"type":"Text","value":""},
  {"type":"NameTag","value":"body"},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"\u003e"},
  {"type":"Text","value":"\n"},
  {"type":"Punctuation","value":"\u003c"},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"/"},
  {"type":"Text","value":""},
  {"type":"NameTag","value":"html"},
  {"type":"Text","value":""},
  {"type":"Punctuation","value":"\u003e"},
  {"type":"Text","value":"\n"}
]
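
For reference, a rough sketch of how fixtures like php.actual / php.expected could be regenerated by hand, assuming chroma's public lexer registry and that Token marshals to the {"type", "value"} JSON shape used above; the project's own lexer tests may do this differently:

package main

import (
    "encoding/json"
    "fmt"
    "io/ioutil"
    "log"

    "github.com/alecthomas/chroma/lexers"
)

func main() {
    source, err := ioutil.ReadFile("lexers/testdata/php.actual")
    if err != nil {
        log.Fatal(err)
    }
    it, err := lexers.Get("php").Tokenise(nil, string(source))
    if err != nil {
        log.Fatal(err)
    }
    // Emit one JSON object per token, roughly the php.expected layout.
    for t := it(); t != nil; t = it() {
        line, err := json.Marshal(t)
        if err != nil {
            log.Fatal(err)
        }
        fmt.Printf("  %s,\n", line)
    }
}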