From ea14dd8660c608784452d795ad4f0f2373a4aba8 Mon Sep 17 00:00:00 2001 From: Alec Thomas Date: Sun, 9 Jun 2019 21:43:16 +1000 Subject: [PATCH] Fixed a fundamental bug where ^ would always match. The engine was always passing a string sliced to the current position, resulting in ^ always matching. Switched to use FindRunesMatchStartingAt. Fixes #242. --- go.mod | 1 + lexers/testdata/ballerina.expected | 7 ++- lexers/testdata/csharp.expected | 2 +- lexers/testdata/erlang.actual | 15 ++++++ lexers/testdata/erlang.expected | 79 ++++++++++++++++++++++++++++++ regexp.go | 15 ++++-- regexp_test.go | 19 +++++++ 7 files changed, 130 insertions(+), 8 deletions(-) create mode 100644 lexers/testdata/erlang.actual create mode 100644 lexers/testdata/erlang.expected diff --git a/go.mod b/go.mod index 9d0aa86..0c1cb0e 100644 --- a/go.mod +++ b/go.mod @@ -10,5 +10,6 @@ require ( github.com/mattn/go-colorable v0.0.9 github.com/mattn/go-isatty v0.0.4 github.com/sergi/go-diff v1.0.0 // indirect + github.com/stretchr/testify v1.2.2 golang.org/x/sys v0.0.0-20181128092732-4ed8d59d0b35 // indirect ) diff --git a/lexers/testdata/ballerina.expected b/lexers/testdata/ballerina.expected index c30dd01..f3b491c 100644 --- a/lexers/testdata/ballerina.expected +++ b/lexers/testdata/ballerina.expected @@ -7,13 +7,16 @@ {"type":"Text","value":"\n\n"}, {"type":"KeywordDeclaration","value":"endpoint"}, {"type":"Text","value":" "}, - {"type":"NameLabel","value":"http:"}, + {"type":"Name","value":"http"}, + {"type":"Operator","value":":"}, {"type":"Name","value":"Listener"}, {"type":"Text","value":" "}, {"type":"Name","value":"listener"}, {"type":"Text","value":" "}, {"type":"Operator","value":"{"}, - {"type":"NameLabel","value":"\n port:"}, + {"type":"Text","value":"\n "}, + {"type":"Name","value":"port"}, + {"type":"Operator","value":":"}, {"type":"Text","value":" "}, {"type":"Name","value":"9090"}, {"type":"Text","value":"\n"}, diff --git a/lexers/testdata/csharp.expected b/lexers/testdata/csharp.expected index a2a4baa..1c47faf 100644 --- a/lexers/testdata/csharp.expected +++ b/lexers/testdata/csharp.expected @@ -1,6 +1,6 @@ [ {"type":"Name","value":"DriveInfo"}, - {"type":"NameAttribute","value":"[]"}, + {"type":"Punctuation","value":"[]"}, {"type":"Text","value":" "}, {"type":"Name","value":"drives"}, {"type":"Text","value":" "}, diff --git a/lexers/testdata/erlang.actual b/lexers/testdata/erlang.actual new file mode 100644 index 0000000..2eb2c3b --- /dev/null +++ b/lexers/testdata/erlang.actual @@ -0,0 +1,15 @@ +-module(repl). + +-export([run/0]). + +run() -> + read_eval_process(). + +read_eval_process() -> + Line = io:get_line("> "), + Out = process_line(Line), + io:format("< ~s~n~n", [Out]), + read_eval_process(). + +process_line(Line) -> + string:uppercase(Line). diff --git a/lexers/testdata/erlang.expected b/lexers/testdata/erlang.expected new file mode 100644 index 0000000..483969b --- /dev/null +++ b/lexers/testdata/erlang.expected @@ -0,0 +1,79 @@ +[ + {"type":"Punctuation","value":"-"}, + {"type":"NameEntity","value":"module"}, + {"type":"Punctuation","value":"("}, + {"type":"Name","value":"repl"}, + {"type":"Punctuation","value":")."}, + {"type":"Text","value":"\n\n"}, + {"type":"Punctuation","value":"-"}, + {"type":"NameEntity","value":"export"}, + {"type":"Punctuation","value":"(["}, + {"type":"Name","value":"run"}, + {"type":"Operator","value":"/"}, + {"type":"LiteralNumberInteger","value":"0"}, + {"type":"Punctuation","value":"])."}, + {"type":"Text","value":"\n\n"}, + {"type":"NameFunction","value":"run"}, + {"type":"Punctuation","value":"()"}, + {"type":"Text","value":" "}, + {"type":"Operator","value":"-\u003e"}, + {"type":"Text","value":"\n "}, + {"type":"Name","value":"read_eval_process"}, + {"type":"Punctuation","value":"()."}, + {"type":"Text","value":"\n\n"}, + {"type":"NameFunction","value":"read_eval_process"}, + {"type":"Punctuation","value":"()"}, + {"type":"Text","value":" "}, + {"type":"Operator","value":"-\u003e"}, + {"type":"Text","value":"\n "}, + {"type":"NameVariable","value":"Line"}, + {"type":"Text","value":" "}, + {"type":"Operator","value":"="}, + {"type":"Text","value":" "}, + {"type":"NameNamespace","value":"io"}, + {"type":"Punctuation","value":":"}, + {"type":"NameFunction","value":"get_line"}, + {"type":"Punctuation","value":"("}, + {"type":"LiteralString","value":"\"\u003e \""}, + {"type":"Punctuation","value":"),"}, + {"type":"Text","value":"\n "}, + {"type":"NameVariable","value":"Out"}, + {"type":"Text","value":" "}, + {"type":"Operator","value":"="}, + {"type":"Text","value":" "}, + {"type":"Name","value":"process_line"}, + {"type":"Punctuation","value":"("}, + {"type":"NameVariable","value":"Line"}, + {"type":"Punctuation","value":"),"}, + {"type":"Text","value":"\n "}, + {"type":"NameNamespace","value":"io"}, + {"type":"Punctuation","value":":"}, + {"type":"NameFunction","value":"format"}, + {"type":"Punctuation","value":"("}, + {"type":"LiteralString","value":"\"\u003c "}, + {"type":"LiteralStringInterpol","value":"~s~n~n"}, + {"type":"LiteralString","value":"\""}, + {"type":"Punctuation","value":","}, + {"type":"Text","value":" "}, + {"type":"Punctuation","value":"["}, + {"type":"NameVariable","value":"Out"}, + {"type":"Punctuation","value":"]),"}, + {"type":"Text","value":"\n "}, + {"type":"Name","value":"read_eval_process"}, + {"type":"Punctuation","value":"()."}, + {"type":"Text","value":"\n\n"}, + {"type":"NameFunction","value":"process_line"}, + {"type":"Punctuation","value":"("}, + {"type":"NameVariable","value":"Line"}, + {"type":"Punctuation","value":")"}, + {"type":"Text","value":" "}, + {"type":"Operator","value":"-\u003e"}, + {"type":"Text","value":"\n "}, + {"type":"NameNamespace","value":"string"}, + {"type":"Punctuation","value":":"}, + {"type":"NameFunction","value":"uppercase"}, + {"type":"Punctuation","value":"("}, + {"type":"NameVariable","value":"Line"}, + {"type":"Punctuation","value":")."}, + {"type":"Text","value":"\n"} +] diff --git a/regexp.go b/regexp.go index 2e7897c..44ff149 100644 --- a/regexp.go +++ b/regexp.go @@ -276,7 +276,7 @@ func (l *LexerState) Iterator() Token { if !ok { panic("unknown state " + l.State) } - ruleIndex, rule, groups := matchRules(l.Text[l.Pos:], selectedRule) + ruleIndex, rule, groups := matchRules(l.Text, l.Pos, selectedRule) // No match. if groups == nil { // From Pygments :\ @@ -363,7 +363,12 @@ func (r *RegexLexer) maybeCompile() (err error) { for state, rules := range r.rules { for i, rule := range rules { if rule.Regexp == nil { - rule.Regexp, err = regexp2.Compile("^(?"+rule.flags+")(?:"+rule.Pattern+")", 0) + pattern := "(?:" + rule.Pattern + ")" + if rule.flags != "" { + pattern = "(?" + rule.flags + ")" + pattern + } + pattern = `\G` + pattern + rule.Regexp, err = regexp2.Compile(pattern, 0) if err != nil { return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err) } @@ -415,10 +420,10 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, return state.Iterator, nil } -func matchRules(text []rune, rules []*CompiledRule) (int, *CompiledRule, []string) { +func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string) { for i, rule := range rules { - match, err := rule.Regexp.FindRunesMatch(text) - if match != nil && err == nil { + match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos) + if match != nil && err == nil && match.Index == pos { groups := []string{} for _, g := range match.Groups() { groups = append(groups, g.String()) diff --git a/regexp_test.go b/regexp_test.go index c47a819..463457a 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/alecthomas/assert" + "github.com/stretchr/testify/require" ) func TestNewlineAtEndOfFile(t *testing.T) { @@ -25,3 +26,21 @@ func TestNewlineAtEndOfFile(t *testing.T) { assert.NoError(t, err) assert.Equal(t, []Token{{Error, "hello"}}, it.Tokens()) } + +func TestMatchingAtStart(t *testing.T) { + l := Coalesce(MustNewLexer(&Config{}, Rules{ + "root": { + {`\s+`, Whitespace, nil}, + {`^-`, Punctuation, Push("directive")}, + {`->`, Operator, nil}, + }, + "directive": { + {"module", NameEntity, Pop(1)}, + }, + })) + it, err := l.Tokenise(nil, `-module ->`) + assert.NoError(t, err) + require.Equal(t, + []Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}}, + it.Tokens()) +}