mirror of https://github.com/alecthomas/chroma.git
chroma/regexp_test.go
Gusted 4d11870090
Don't output extra whitespace in YAML multiline (#993)
This resolves an issue with parsing YAML multiline blocks, for
example:
```yaml
a: |
  multiline literal
  line 2
```

The regex used captures the amount of indentation in its third
capture group and then uses that as a kind of "state" to know which
lines are part of the indented multiline block. However, because it is a
capture group, it has to be assigned a token, which was `TextWhitespace`.
As a result, the indentation was output again after the multiline block.
Technically it should be a non-capturing group, but then it can no
longer be referred to from the regex. I've therefore gone with adding a
new token, `Ignore`, which is never emitted by the iterator, so capture
groups can still be used without their contents showing up in the
output.
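
As a minimal sketch of the new behaviour (mirroring the `TestIgnoreToken` case in the file below; error handling elided for brevity):

```go
lexer, _ := NewLexer(&Config{EnsureNL: true}, func() Rules {
	return Rules{
		"root": {
			// Group 1 captures the indentation so \1 can backreference it;
			// mapping it to Ignore keeps the captured text out of the
			// emitted token stream.
			{`(\s*)(\w+)(?:\1)(\n)`, ByGroups(Ignore, Keyword, Whitespace), nil},
		},
	}
})

it, _ := lexer.Tokenise(nil, " hello ")
// it.Tokens() is now []Token{{Keyword, "hello"}, {Whitespace, "\n"}};
// the indentation is matched on both sides but never output.
```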

## Before

![image](https://github.com/user-attachments/assets/c29353c5-9e15-4f14-a733-57a60fb51910)

## After

![image](https://github.com/user-attachments/assets/57b5d129-a9d3-4b84-ae1f-dc05182b9ad3)
2024-08-23 06:58:31 +10:00


package chroma

import (
	"testing"

	assert "github.com/alecthomas/assert/v2"
)
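
// mustNewLexer builds a RegexLexer from the given config and rules,
// failing the test immediately if construction returns an error.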
func mustNewLexer(t *testing.T, config *Config, rules Rules) *RegexLexer { // nolint: forbidigo
	lexer, err := NewLexer(config, func() Rules {
		return rules
	})
	assert.NoError(t, err)
	return lexer
}
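
// TestNewlineAtEndOfFile verifies that EnsureNL appends a trailing newline
// before matching, and that without it the final unterminated line does not
// match and is emitted as an Error token.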
func TestNewlineAtEndOfFile(t *testing.T) {
	l := Coalesce(mustNewLexer(t, &Config{EnsureNL: true}, Rules{ // nolint: forbidigo
		"root": {
			{`(\w+)(\n)`, ByGroups(Keyword, Whitespace), nil},
		},
	}))
	it, err := l.Tokenise(nil, `hello`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{Keyword, "hello"}, {Whitespace, "\n"}}, it.Tokens())

	l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo
		"root": {
			{`(\w+)(\n)`, ByGroups(Keyword, Whitespace), nil},
		},
	}))
	it, err = l.Tokenise(nil, `hello`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{Error, "hello"}}, it.Tokens())
}
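
// TestMatchingAtStart verifies that a ^-anchored pattern matches only at the
// start of the input, so the later "->" is consumed by the unanchored
// Operator rule instead.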
func TestMatchingAtStart(t *testing.T) {
	l := Coalesce(mustNewLexer(t, &Config{}, Rules{ // nolint: forbidigo
		"root": {
			{`\s+`, Whitespace, nil},
			{`^-`, Punctuation, Push("directive")},
			{`->`, Operator, nil},
		},
		"directive": {
			{"module", NameEntity, Pop(1)},
		},
	}))
	it, err := l.Tokenise(nil, `-module ->`)
	assert.NoError(t, err)
	assert.Equal(t,
		[]Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}},
		it.Tokens())
}
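
// TestEnsureLFOption verifies that the EnsureLF tokenise option normalises
// \r and \r\n line endings to \n, and that endings pass through unchanged
// when it is disabled.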
func TestEnsureLFOption(t *testing.T) {
	l := Coalesce(mustNewLexer(t, &Config{}, Rules{ // nolint: forbidigo
		"root": {
			{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
		},
	}))
	it, err := l.Tokenise(&TokeniseOptions{
		State:    "root",
		EnsureLF: true,
	}, "hello\r\nworld\r")
	assert.NoError(t, err)
	assert.Equal(t, []Token{
		{Keyword, "hello"},
		{Whitespace, "\n"},
		{Keyword, "world"},
		{Whitespace, "\n"},
	}, it.Tokens())

	l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo
		"root": {
			{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
		},
	}))
	it, err = l.Tokenise(&TokeniseOptions{
		State:    "root",
		EnsureLF: false,
	}, "hello\r\nworld\r")
	assert.NoError(t, err)
	assert.Equal(t, []Token{
		{Keyword, "hello"},
		{Whitespace, "\r\n"},
		{Keyword, "world"},
		{Whitespace, "\r"},
	}, it.Tokens())
}
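
// TestEnsureLFFunc exercises the ensureLF helper directly against a table of
// CR and CRLF inputs.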
func TestEnsureLFFunc(t *testing.T) {
	tests := []struct{ in, out string }{
		{in: "", out: ""},
		{in: "abc", out: "abc"},
		{in: "\r", out: "\n"},
		{in: "a\r", out: "a\n"},
		{in: "\rb", out: "\nb"},
		{in: "a\rb", out: "a\nb"},
		{in: "\r\n", out: "\n"},
		{in: "a\r\n", out: "a\n"},
		{in: "\r\nb", out: "\nb"},
		{in: "a\r\nb", out: "a\nb"},
		{in: "\r\r\r\n\r", out: "\n\n\n\n"},
	}
	for _, test := range tests {
		out := ensureLF(test.in)
		assert.Equal(t, out, test.out)
	}
}
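
// TestByGroupNames covers ByGroupNames: named groups present in the map emit
// their mapped tokens, groups missing from the map emit Error tokens, text
// outside any capture group is dropped, and a pattern with no named groups
// yields the whole match as a single Error token.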
func TestByGroupNames(t *testing.T) {
	l := Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)(?<operator>=)(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:      String,
					`operator`: Operator,
					`value`:    String,
				}),
				nil,
			},
		},
	}))
	it, err := l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc`}, {Operator, `=`}, {String, `123`}}, it.Tokens())

	l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)(?<operator>=)(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:   String,
					`value`: String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, it.Tokens())

	l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)=(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:   String,
					`value`: String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc123`}}, it.Tokens())

	l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)(?<op>=)(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:      String,
					`operator`: Operator,
					`value`:    String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, it.Tokens())

	l = Coalesce(mustNewLexer(t, nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`\w+=\w+`,
				ByGroupNames(map[string]Emitter{
					`key`:      String,
					`operator`: Operator,
					`value`:    String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{Error, `abc=123`}}, it.Tokens())
}
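
// TestIgnoreToken verifies that a capture group mapped to Ignore can still be
// backreferenced in the pattern but is omitted from the emitted tokens.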
func TestIgnoreToken(t *testing.T) {
	l := Coalesce(mustNewLexer(t, &Config{EnsureNL: true}, Rules{ // nolint: forbidigo
		"root": {
			{`(\s*)(\w+)(?:\1)(\n)`, ByGroups(Ignore, Keyword, Whitespace), nil},
		},
	}))
	it, err := l.Tokenise(nil, ` hello `)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{Keyword, "hello"}, {TextWhitespace, "\n"}}, it.Tokens())
}