mirror of
https://github.com/alecthomas/chroma.git
synced 2025-03-17 20:58:08 +02:00
Add new TokeniseOption EnsureLF (#336)
* Add new TokeniseOption EnsureLF ref #329 * Use efficient process suggested by @chmike
This commit is contained in:
parent
e5d9650a20
commit
34d9c7143b
7
lexer.go
7
lexer.go
@ -6,7 +6,8 @@ import (
|
||||
|
||||
var (
|
||||
defaultOptions = &TokeniseOptions{
|
||||
State: "root",
|
||||
State: "root",
|
||||
EnsureLF: true,
|
||||
}
|
||||
)
|
||||
|
||||
@ -80,6 +81,10 @@ type TokeniseOptions struct {
|
||||
State string
|
||||
// Nested tokenisation.
|
||||
Nested bool
|
||||
|
||||
// If true, all EOLs are converted into LF
|
||||
// by replacing CRLF and CR
|
||||
EnsureLF bool
|
||||
}
|
||||
|
||||
// A Lexer for tokenising source code.
|
||||
|
22
regexp.go
22
regexp.go
@ -410,6 +410,9 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
|
||||
if options == nil {
|
||||
options = defaultOptions
|
||||
}
|
||||
if options.EnsureLF {
|
||||
text = ensureLF(text)
|
||||
}
|
||||
if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
|
||||
text += "\n"
|
||||
}
|
||||
@ -437,3 +440,22 @@ func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule
|
||||
}
|
||||
return 0, &CompiledRule{}, nil
|
||||
}
|
||||
|
||||
// ensureLF normalises line endings: every CRLF pair and every lone CR in
// text becomes a single \n. It does the work in one pass over the bytes,
// which is cheaper than chaining strings.ReplaceAll calls.
func ensureLF(text string) string {
	out := make([]byte, 0, len(text))
	for i := 0; i < len(text); i++ {
		ch := text[i]
		switch {
		case ch == '\r' && i+1 < len(text) && text[i+1] == '\n':
			// CRLF: drop the CR; the LF is emitted on the next iteration.
		case ch == '\r':
			// Lone CR: rewrite as LF.
			out = append(out, '\n')
		default:
			out = append(out, ch)
		}
	}
	return string(out)
}
|
||||
|
@ -43,3 +43,59 @@ func TestMatchingAtStart(t *testing.T) {
|
||||
[]Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}},
|
||||
it.Tokens())
|
||||
}
|
||||
|
||||
func TestEnsureLFOption(t *testing.T) {
|
||||
l := Coalesce(MustNewLexer(&Config{}, Rules{
|
||||
"root": {
|
||||
{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
|
||||
},
|
||||
}))
|
||||
it, err := l.Tokenise(&TokeniseOptions{
|
||||
State: "root",
|
||||
EnsureLF: true,
|
||||
}, "hello\r\nworld\r")
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, []Token{
|
||||
{Keyword, "hello"},
|
||||
{Whitespace, "\n"},
|
||||
{Keyword, "world"},
|
||||
{Whitespace, "\n"},
|
||||
}, it.Tokens())
|
||||
|
||||
l = Coalesce(MustNewLexer(nil, Rules{
|
||||
"root": {
|
||||
{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
|
||||
},
|
||||
}))
|
||||
it, err = l.Tokenise(&TokeniseOptions{
|
||||
State: "root",
|
||||
EnsureLF: false,
|
||||
}, "hello\r\nworld\r")
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, []Token{
|
||||
{Keyword, "hello"},
|
||||
{Whitespace, "\r\n"},
|
||||
{Keyword, "world"},
|
||||
{Whitespace, "\r"},
|
||||
}, it.Tokens())
|
||||
}
|
||||
|
||||
func TestEnsureLFFunc(t *testing.T) {
|
||||
tests := []struct{ in, out string }{
|
||||
{in: "", out: ""},
|
||||
{in: "abc", out: "abc"},
|
||||
{in: "\r", out: "\n"},
|
||||
{in: "a\r", out: "a\n"},
|
||||
{in: "\rb", out: "\nb"},
|
||||
{in: "a\rb", out: "a\nb"},
|
||||
{in: "\r\n", out: "\n"},
|
||||
{in: "a\r\n", out: "a\n"},
|
||||
{in: "\r\nb", out: "\nb"},
|
||||
{in: "a\r\nb", out: "a\nb"},
|
||||
{in: "\r\r\r\n\r", out: "\n\n\n\n"},
|
||||
}
|
||||
for _, test := range tests {
|
||||
out := ensureLF(test.in)
|
||||
assert.Equal(t, out, test.out)
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user