
Implemented delegating lexer.

This is a lexer that is useful for templating languages, where the
surrounding text may be of a different syntax, e.g. PHP+HTML.

The PHP lexer has been changed accordingly.
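
A minimal sketch of how the new composition can be used (assuming lookup through
the lexers package's Get helper; the HTML+PHP input is illustrative only):

package main

import (
	"fmt"

	"github.com/alecthomas/chroma/lexers"
)

func main() {
	// After this change the registered PHP lexer is a DelegatingLexer:
	// the HTML lexer is the root and the PHP rules are the embedded language.
	lexer := lexers.Get("php")

	it, err := lexer.Tokenise(nil, `<b><?php echo "hi"; ?></b>`)
	if err != nil {
		panic(err)
	}
	for _, tok := range it.Tokens() {
		fmt.Printf("%s %q\n", tok.Type, tok.Value)
	}
}

The same DelegatingLexer(root, language) composition applies to any templating
language whose embedded lexer emits Other for the text it does not recognise.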

Fixes #80
Alec Thomas 2018-03-18 20:38:29 +11:00
parent 15a009f0fc
commit db6920e68f
8 changed files with 230 additions and 166 deletions


@@ -9,10 +9,12 @@ type delegatingLexer struct {
language Lexer
}
// DelegatingLexer takes two lexer as arguments. A root lexer and
// a language lexer. First everything is scanned using the language
// lexer, afterwards all Other tokens are lexed using the root
// lexer.
// DelegatingLexer combines two lexers to handle the common case of a language embedded inside another, such as PHP
// inside HTML or PHP inside plain text.
//
// It takes two lexers as arguments: a root lexer and a language lexer. First everything is scanned using the language
// lexer, which must return "Other" for unrecognised tokens. Then all "Other" tokens are lexed using the root lexer.
// Finally, these two sets of tokens are merged.
//
// The lexers from the template lexer package use this base lexer.
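//
// For example, with the test lexers in delegate_test.go, tokenising
// "hello world <? what ?> there" yields the root lexer's tokens for
// "hello world " and " there", with the language lexer's tokens for
// "<? what ?>" spliced in between.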
func DelegatingLexer(root Lexer, language Lexer) Lexer {
@@ -26,101 +28,108 @@ func (d *delegatingLexer) Config() *Config {
return d.language.Config()
}
type tokenSplit struct {
pos int
tokens []*Token
}
func splitOtherTokens(it Iterator) ([]tokenSplit, string) {
splits := []tokenSplit{}
var split *tokenSplit
other := bytes.Buffer{}
offset := 0
for t := it(); t != nil; t = it() {
if t.Type == Other {
if split != nil {
splits = append(splits, *split)
split = nil
}
other.WriteString(t.Value)
} else {
if split == nil {
split = &tokenSplit{pos: offset}
}
split.tokens = append(split.tokens, t)
}
offset += len(t.Value)
}
if split != nil {
splits = append(splits, *split)
}
return splits, other.String()
// An insertion is the character range where language tokens should be inserted.
type insertion struct {
start, end int
tokens []*Token
}
func (d *delegatingLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
it, err := d.language.Tokenise(options, text)
tokens, err := Tokenise(Coalesce(d.language), options, text)
if err != nil {
return nil, err
}
splits, other := splitOtherTokens(it)
it, err = d.root.Tokenise(options, other)
if err != nil {
return nil, err
}
// Compute insertions and gather "Other" tokens.
others := &bytes.Buffer{}
insertions := []*insertion{}
var insert *insertion
offset := 0
return func() *Token {
// First, see if there's a split at the start of this token.
for len(splits) > 0 && splits[0].pos == offset {
if len(splits[0].tokens) > 0 {
t := splits[0].tokens[0]
splits[0].tokens = splits[0].tokens[1:]
offset += len(t.Value)
return t
var last *Token
for _, t := range tokens {
if t.Type == Other {
if last != nil && insert != nil && last.Type != Other {
insert.end = offset
}
// End of tokens from this split, shift it off the queue.
splits = splits[1:]
others.WriteString(t.Value)
} else {
if last == nil || last.Type == Other {
insert = &insertion{start: offset}
insertions = append(insertions, insert)
}
insert.tokens = append(insert.tokens, t)
}
last = t
offset += len(t.Value)
}
// No split, try to consume a token.
t := it()
if t == nil {
for len(splits) > 0 {
if len(splits[0].tokens) > 0 {
t = splits[0].tokens[0]
splits[0].tokens = splits[0].tokens[1:]
offset += len(t.Value)
return t
}
// End of tokens from this split, shift it off the queue.
splits = splits[1:]
}
if len(insertions) == 0 {
return d.root.Tokenise(options, text)
}
// Lex the other tokens.
rootTokens, err := Tokenise(d.root, options, others.String())
if err != nil {
return nil, err
}
// Interleave the two sets of tokens.
out := []*Token{}
offset = 0
index := 0
next := func() *Token {
if index >= len(rootTokens) {
return nil
}
// Check if there's a split in the middle of the current token.
if len(splits) > 0 && splits[0].pos < offset+len(t.Value) {
// Split the token.
next := t.Clone()
point := splits[0].pos - offset
next.Value = next.Value[point:]
t.Value = t.Value[:point]
// Insert the tail of the split token after any other splits at the same point.
tailPos := offset + len(t.Value)
tail := []tokenSplit{{pos: tailPos, tokens: []*Token{next}}}
i := 0
for ; i < len(splits); i++ {
if splits[i].pos > tailPos {
break
}
}
splits = append(splits[:i], append(tail, splits[i:]...)...)
// Finally, return the head.
t := rootTokens[index]
index++
return t
}
t := next()
for _, insert := range insertions {
// Consume tokens until insertion point.
for t != nil && offset+len(t.Value) <= insert.start {
out = append(out, t)
offset += len(t.Value)
t = next()
}
// End of root tokens, append insertion point.
if t == nil {
out = append(out, insert.tokens...)
break
}
offset += len(t.Value)
return t
}, nil
// Split and insert.
l, r := splitToken(t, insert.start-offset)
if l != nil {
out = append(out, l)
offset += len(l.Value)
}
out = append(out, insert.tokens...)
offset += insert.end - insert.start
if r != nil {
out = append(out, r)
offset += len(r.Value)
}
t = next()
}
if t != nil {
out = append(out, t)
}
// Remainder.
out = append(out, rootTokens[index:]...)
return Literator(out...), nil
}
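// splitToken splits a token in two at the given byte offset. The left or right
// half is nil when the offset falls at the start or end of the token's value,
// respectively.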
func splitToken(t *Token, offset int) (l *Token, r *Token) {
if offset == 0 {
return nil, t
}
if offset >= len(t.Value) {
return t, nil
}
l = t.Clone()
r = t.Clone()
l.Value = l.Value[:offset]
r.Value = r.Value[offset:]
return
}


@@ -6,11 +6,6 @@ import (
"github.com/alecthomas/assert"
)
var (
delegateSourceMiddle = `hello world <? what ?> there`
delegateSourceEnd = `hello world <? what there`
)
func makeDelegationTestLexers() (lang Lexer, root Lexer) {
return MustNewLexer(nil, Rules{
"root": {
@@ -32,85 +27,84 @@ func makeDelegationTestLexers() (lang Lexer, root Lexer) {
})
}
func TestDelegateSplitOtherTokens(t *testing.T) {
lang, _ := makeDelegationTestLexers()
it, err := lang.Tokenise(nil, delegateSourceMiddle)
assert.NoError(t, err)
splits, other := splitOtherTokens(it)
assert.Equal(t, "hello world there", other)
expected := []tokenSplit{tokenSplit{
pos: 12,
tokens: []*Token{
func TestDelegate(t *testing.T) {
testdata := []struct {
name string
source string
expected []*Token
}{
{"SourceInMiddle", `hello world <? what ?> there`, []*Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
{TextWhitespace, " "},
// lang
{CommentPreproc, "<?"},
{Whitespace, " "},
{Keyword, "what"},
{Whitespace, " "},
{CommentPreproc, "?>"},
},
}}
assert.Equal(t, expected, splits)
}
func TestDelegateSplitOtherTokensSourceAtEnd(t *testing.T) {
lang, _ := makeDelegationTestLexers()
lang = Coalesce(lang)
it, err := lang.Tokenise(nil, delegateSourceEnd)
assert.NoError(t, err)
splits, other := splitOtherTokens(it)
assert.Equal(t, "hello world ", other)
expected := []tokenSplit{tokenSplit{
pos: 12,
tokens: []*Token{
// /lang
{TextWhitespace, " "},
{Name, "there"},
}},
{"SourceBeginning", `<? what ?> hello world there`, []*Token{
{CommentPreproc, "<?"},
{TextWhitespace, " "},
{Keyword, "what"},
{TextWhitespace, " "},
{CommentPreproc, "?>"},
{TextWhitespace, " "},
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
{TextWhitespace, " "},
{Name, "there"},
}},
{"SourceEnd", `hello world <? what there`, []*Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
{TextWhitespace, " "},
// lang
{CommentPreproc, "<?"},
{Whitespace, " "},
{Keyword, "what"},
{TextWhitespace, " "},
{Error, "there"},
},
}}
assert.Equal(t, expected, splits)
}
func TestDelegate(t *testing.T) {
}},
{"SourceMultiple", "hello world <? what ?> hello there <? what ?> hello", []*Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
{TextWhitespace, " "},
{CommentPreproc, "<?"},
{TextWhitespace, " "},
{Keyword, "what"},
{TextWhitespace, " "},
{CommentPreproc, "?>"},
{TextWhitespace, " "},
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "there"},
{TextWhitespace, " "},
{CommentPreproc, "<?"},
{TextWhitespace, " "},
{Keyword, "what"},
{TextWhitespace, " "},
{CommentPreproc, "?>"},
{TextWhitespace, " "},
{Keyword, "hello"},
}},
}
lang, root := makeDelegationTestLexers()
delegate := DelegatingLexer(root, lang)
it, err := delegate.Tokenise(nil, delegateSourceMiddle)
assert.NoError(t, err)
expected := []*Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
{TextWhitespace, " "},
// lang
{CommentPreproc, "<?"},
{Whitespace, " "},
{Keyword, "what"},
{Whitespace, " "},
{CommentPreproc, "?>"},
// /lang
{TextWhitespace, " "},
{Name, "there"},
for _, test := range testdata {
t.Run(test.name, func(t *testing.T) {
it, err := delegate.Tokenise(nil, test.source)
assert.NoError(t, err)
actual := it.Tokens()
assert.Equal(t, test.expected, actual)
})
}
assert.Equal(t, expected, it.Tokens())
}
func TestDelegateEnd(t *testing.T) {
lang, root := makeDelegationTestLexers()
lang = Coalesce(lang)
delegate := DelegatingLexer(root, lang)
it, err := delegate.Tokenise(nil, delegateSourceEnd)
assert.NoError(t, err)
expected := []*Token{
{Keyword, "hello"},
{TextWhitespace, " "},
{Name, "world"},
{TextWhitespace, " "},
// lang
{CommentPreproc, "<?"},
{Whitespace, " "},
{Keyword, "what"},
{TextWhitespace, " "},
{Error, "there"},
}
assert.Equal(t, expected, it.Tokens())
}

lexers/circular/doc.go Normal file

@@ -0,0 +1,2 @@
// Package circular exists to break circular dependencies between lexers.
package circular


@@ -1,12 +1,13 @@
package p
package circular
import (
. "github.com/alecthomas/chroma" // nolint
"github.com/alecthomas/chroma/lexers/h"
"github.com/alecthomas/chroma/lexers/internal"
)
// PHP lexer.
var PHP = internal.Register(DelegatingLexer(HTML, MustNewLexer(
var PHP = internal.Register(DelegatingLexer(h.HTML, MustNewLexer(
&Config{
Name: "PHP",
Aliases: []string{"php", "php3", "php4", "php5"},


@@ -9,6 +9,7 @@ import (
_ "github.com/alecthomas/chroma/lexers/a"
_ "github.com/alecthomas/chroma/lexers/b"
_ "github.com/alecthomas/chroma/lexers/c"
_ "github.com/alecthomas/chroma/lexers/circular"
_ "github.com/alecthomas/chroma/lexers/d"
_ "github.com/alecthomas/chroma/lexers/e"
_ "github.com/alecthomas/chroma/lexers/f"


@@ -1,9 +1,9 @@
package s
import (
. "github.com/alecthomas/chroma" // nolint
. "github.com/alecthomas/chroma" // nolint
. "github.com/alecthomas/chroma/lexers/circular" // nolint
"github.com/alecthomas/chroma/lexers/internal"
. "github.com/alecthomas/chroma/lexers/p" // nolint
)
// Smarty lexer.


@@ -1,3 +1,8 @@
<!DOCTYPE html>
<html>
<body>
<h1>My first PHP page</h1>
<?php
$docs = $modx->getIterator('modResource', ["parent" => 84]);
@@ -8,4 +13,7 @@ foreach($docs as $doc){
print_r($doc->content);
// $doc->save();
}
// some comment
// some comment
?>
</body>
</html>


@@ -1,4 +1,35 @@
[
{"type":"CommentPreproc","value":"\u003c!DOCTYPE html\u003e"},
{"type":"Text","value":"\n"},
{"type":"Punctuation","value":"\u003c"},
{"type":"Text","value":""},
{"type":"NameTag","value":"html"},
{"type":"Punctuation","value":""},
{"type":"Text","value":""},
{"type":"Punctuation","value":"\u003e"},
{"type":"Text","value":"\n"},
{"type":"Punctuation","value":"\u003c"},
{"type":"Text","value":""},
{"type":"NameTag","value":"body"},
{"type":"Punctuation","value":""},
{"type":"Text","value":""},
{"type":"Punctuation","value":"\u003e"},
{"type":"Text","value":"\n\n"},
{"type":"Punctuation","value":"\u003c"},
{"type":"Text","value":""},
{"type":"NameTag","value":"h1"},
{"type":"Punctuation","value":""},
{"type":"Text","value":""},
{"type":"Punctuation","value":"\u003e"},
{"type":"Text","value":"My first PHP page"},
{"type":"Punctuation","value":"\u003c"},
{"type":"Text","value":""},
{"type":"Punctuation","value":"/"},
{"type":"Text","value":""},
{"type":"NameTag","value":"h1"},
{"type":"Text","value":""},
{"type":"Punctuation","value":"\u003e"},
{"type":"Text","value":"\n"},
{"type":"CommentPreproc","value":"\u003c?php"},
{"type":"Text","value":"\n\n"},
{"type":"NameVariable","value":"$docs"},
@@ -68,5 +99,23 @@
{"type":"CommentSingle","value":"// $doc-\u003esave();\n"},
{"type":"Punctuation","value":"}"},
{"type":"Text","value":" \n"},
{"type":"CommentSingle","value":"// some comment\n"}
{"type":"CommentSingle","value":"// some comment\n"},
{"type":"CommentPreproc","value":"?\u003e"},
{"type":"Text","value":"\n"},
{"type":"Punctuation","value":"\u003c"},
{"type":"Text","value":""},
{"type":"Punctuation","value":"/"},
{"type":"Text","value":""},
{"type":"NameTag","value":"body"},
{"type":"Text","value":""},
{"type":"Punctuation","value":"\u003e"},
{"type":"Text","value":"\n"},
{"type":"Punctuation","value":"\u003c"},
{"type":"Text","value":""},
{"type":"Punctuation","value":"/"},
{"type":"Text","value":""},
{"type":"NameTag","value":"html"},
{"type":"Text","value":""},
{"type":"Punctuation","value":"\u003e"},
{"type":"Text","value":"\n"}
]