1
0
mirror of https://github.com/pocketbase/pocketbase.git synced 2025-02-14 00:42:10 +02:00

added tokenizer.IgnoreParenthesis() to allow ignoring the parenthesis characters boundary checks

This commit is contained in:
Gani Georgiev 2023-09-17 12:14:57 +03:00
parent 71f9be3cb0
commit f3bcd7d3df
2 changed files with 171 additions and 133 deletions

View File

@ -34,9 +34,10 @@ func NewFromBytes(b []byte) *Tokenizer {
// New creates new Tokenizer from the provided reader with DefaultSeparators. // New creates new Tokenizer from the provided reader with DefaultSeparators.
func New(r io.Reader) *Tokenizer { func New(r io.Reader) *Tokenizer {
return &Tokenizer{ return &Tokenizer{
r: bufio.NewReader(r), r: bufio.NewReader(r),
separators: DefaultSeparators, separators: DefaultSeparators,
keepSeparator: false, keepSeparator: false,
ignoreParenthesis: false,
} }
} }
@ -45,54 +46,61 @@ func New(r io.Reader) *Tokenizer {
type Tokenizer struct { type Tokenizer struct {
r *bufio.Reader r *bufio.Reader
separators []rune separators []rune
keepSeparator bool keepSeparator bool
ignoreParenthesis bool
} }
// Separators defines the provided separators of the current Tokenizer. // Separators defines the provided separators of the current Tokenizer.
func (s *Tokenizer) Separators(separators ...rune) { func (t *Tokenizer) Separators(separators ...rune) {
s.separators = separators t.separators = separators
} }
// KeepSeparator defines whether to keep the separator rune as part // KeepSeparator defines whether to keep the separator rune as part
// of the token (defaults to false). // of the token (defaults to false).
func (s *Tokenizer) KeepSeparator(state bool) { func (t *Tokenizer) KeepSeparator(state bool) {
s.keepSeparator = state t.keepSeparator = state
}
// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
// and to treat the '(' and ')' as regular characters.
func (t *Tokenizer) IgnoreParenthesis(state bool) {
t.ignoreParenthesis = state
} }
// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed). // Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
// //
// Returns [io.EOF] error when there are no more tokens to scan. // Returns [io.EOF] error when there are no more tokens to scan.
func (s *Tokenizer) Scan() (string, error) { func (t *Tokenizer) Scan() (string, error) {
ch := s.read() ch := t.read()
if ch == eof { if ch == eof {
return "", io.EOF return "", io.EOF
} }
if isWhitespaceRune(ch) { if isWhitespaceRune(ch) {
s.readWhiteSpaces() t.readWhiteSpaces()
} else { } else {
s.unread() t.unread()
} }
token, err := s.readToken() token, err := t.readToken()
if err != nil { if err != nil {
return "", err return "", err
} }
// read all remaining whitespaces // read all remaining whitespaces
s.readWhiteSpaces() t.readWhiteSpaces()
return token, err return token, err
} }
// ScanAll reads the entire Tokenizer's buffer and returns all found tokens. // ScanAll reads the entire Tokenizer's buffer and returns all found tokens.
func (s *Tokenizer) ScanAll() ([]string, error) { func (t *Tokenizer) ScanAll() ([]string, error) {
tokens := []string{} tokens := []string{}
for { for {
token, err := s.Scan() token, err := t.Scan()
if err != nil { if err != nil {
if err == io.EOF { if err == io.EOF {
break break
@ -108,35 +116,35 @@ func (s *Tokenizer) ScanAll() ([]string, error) {
} }
// readToken reads a single token from the buffer and returns it. // readToken reads a single token from the buffer and returns it.
func (s *Tokenizer) readToken() (string, error) { func (t *Tokenizer) readToken() (string, error) {
var buf bytes.Buffer var buf bytes.Buffer
var parenthesis int var parenthesis int
var quoteCh rune var quoteCh rune
var prevCh rune var prevCh rune
for { for {
ch := s.read() ch := t.read()
if ch == eof { if ch == eof {
break break
} }
if !isEscapeRune(prevCh) { if !isEscapeRune(prevCh) {
if ch == '(' && quoteCh == eof { if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
parenthesis++ parenthesis++ // opening parenthesis
} else if ch == ')' && parenthesis > 0 && quoteCh == eof { } else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
parenthesis-- parenthesis-- // closing parenthesis
} else if isQuoteRune(ch) { } else if isQuoteRune(ch) {
if quoteCh == ch { if quoteCh == ch {
quoteCh = eof // reached closing quote quoteCh = eof // closing quote
} else if quoteCh == eof { } else if quoteCh == eof {
quoteCh = ch // opening quote quoteCh = ch // opening quote
} }
} }
} }
if s.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof { if t.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
if s.keepSeparator { if t.keepSeparator {
buf.WriteRune(ch) buf.WriteRune(ch)
} }
break break
@ -154,16 +162,16 @@ func (s *Tokenizer) readToken() (string, error) {
} }
// readWhiteSpaces consumes all contiguous whitespace runes. // readWhiteSpaces consumes all contiguous whitespace runes.
func (s *Tokenizer) readWhiteSpaces() { func (t *Tokenizer) readWhiteSpaces() {
for { for {
ch := s.read() ch := t.read()
if ch == eof { if ch == eof {
break break
} }
if !s.isSeperatorRune(ch) { if !t.isSeperatorRune(ch) {
s.unread() t.unread()
break break
} }
} }
@ -171,8 +179,8 @@ func (s *Tokenizer) readWhiteSpaces() {
// read reads the next rune from the buffered reader. // read reads the next rune from the buffered reader.
// Returns `rune(0)` if an error or `io.EOF` occurs. // Returns `rune(0)` if an error or `io.EOF` occurs.
func (s *Tokenizer) read() rune { func (t *Tokenizer) read() rune {
ch, _, err := s.r.ReadRune() ch, _, err := t.r.ReadRune()
if err != nil { if err != nil {
return eof return eof
} }
@ -181,13 +189,13 @@ func (s *Tokenizer) read() rune {
} }
// unread places the previously read rune back on the reader. // unread places the previously read rune back on the reader.
func (s *Tokenizer) unread() error { func (t *Tokenizer) unread() error {
return s.r.UnreadRune() return t.r.UnreadRune()
} }
// isSeperatorRune checks if a rune is a token part separator. // isSeperatorRune checks if a rune is a token part separator.
func (s *Tokenizer) isSeperatorRune(ch rune) bool { func (t *Tokenizer) isSeperatorRune(ch rune) bool {
for _, r := range s.separators { for _, r := range t.separators {
if ch == r { if ch == r {
return true return true
} }

View File

@ -28,32 +28,38 @@ func TestFactories(t *testing.T) {
} }
for _, s := range scenarios { for _, s := range scenarios {
content, _ := s.tk.r.ReadString(0) t.Run(s.name, func(t *testing.T) {
content, _ := s.tk.r.ReadString(0)
if content != expectedContent { if content != expectedContent {
t.Fatalf("[%s] Expected reader with content %q, got %q", s.name, expectedContent, content) t.Fatalf("Expected reader with content %q, got %q", expectedContent, content)
} }
if s.tk.keepSeparator != false { if s.tk.keepSeparator != false {
t.Fatalf("[%s] Expected false, got true", s.name) t.Fatal("Expected keepSeparator false, got true")
} }
if len(s.tk.separators) != len(DefaultSeparators) { if s.tk.ignoreParenthesis != false {
t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, DefaultSeparators, s.tk.separators) t.Fatal("Expected ignoreParenthesis false, got true")
} }
for _, r := range s.tk.separators { if len(s.tk.separators) != len(DefaultSeparators) {
exists := false t.Fatalf("Expected \n%v, \ngot \n%v", DefaultSeparators, s.tk.separators)
for _, def := range s.tk.separators { }
if r == def {
exists = true for _, r := range s.tk.separators {
break exists := false
for _, def := range s.tk.separators {
if r == def {
exists = true
break
}
}
if !exists {
t.Fatalf("Unexpected sepator %s", string(r))
} }
} }
if !exists { })
t.Fatalf("[%s] Unexpected sepator %s", s.name, string(r))
}
}
} }
} }
@ -85,54 +91,58 @@ func TestScan(t *testing.T) {
func TestScanAll(t *testing.T) { func TestScanAll(t *testing.T) {
scenarios := []struct { scenarios := []struct {
name string name string
content string content string
separators []rune separators []rune
keepSeparator bool keepSeparator bool
expectError bool ignoreParenthesis bool
expectTokens []string expectError bool
expectTokens []string
}{ }{
{ {
"empty string", name: "empty string",
"", content: "",
DefaultSeparators, separators: DefaultSeparators,
false, keepSeparator: false,
false, ignoreParenthesis: false,
nil, expectError: false,
expectTokens: nil,
}, },
{ {
"unbalanced parenthesis", name: "unbalanced parenthesis",
`(a,b() c`, content: `(a,b() c`,
DefaultSeparators, separators: DefaultSeparators,
false, keepSeparator: false,
true, ignoreParenthesis: false,
[]string{}, expectError: true,
expectTokens: []string{},
}, },
{ {
"unmatching quotes", name: "unmatching quotes",
`'asd"`, content: `'asd"`,
DefaultSeparators, separators: DefaultSeparators,
false, keepSeparator: false,
true, ignoreParenthesis: false,
[]string{}, expectError: true,
expectTokens: []string{},
}, },
{ {
"no separators", name: "no separators",
`a, b, c, d, e 123, "abc"`, content: `a, b, c, d, e 123, "abc"`,
nil, separators: nil,
false, keepSeparator: false,
false, ignoreParenthesis: false,
[]string{ expectError: false,
`a, b, c, d, e 123, "abc"`, expectTokens: []string{`a, b, c, d, e 123, "abc"`},
},
}, },
{ {
"default separators", name: "default separators",
`a, b, c, d e, "a,b, c ", (123, 456)`, content: `a, b, c, d e, "a,b, c ", (123, 456)`,
DefaultSeparators, separators: DefaultSeparators,
false, keepSeparator: false,
false, ignoreParenthesis: false,
[]string{ expectError: false,
expectTokens: []string{
"a", "a",
"b", "b",
"c", "c",
@ -142,12 +152,13 @@ func TestScanAll(t *testing.T) {
}, },
}, },
{ {
"default separators (with preserve)", name: "default separators (with preserve)",
`a, b, c, d e, "a,b, c ", (123, 456)`, content: `a, b, c, d e, "a,b, c ", (123, 456)`,
DefaultSeparators, separators: DefaultSeparators,
true, keepSeparator: true,
false, ignoreParenthesis: false,
[]string{ expectError: false,
expectTokens: []string{
"a,", "a,",
"b,", "b,",
"c,", "c,",
@ -157,14 +168,15 @@ func TestScanAll(t *testing.T) {
}, },
}, },
{ {
"custom separators", name: "custom separators",
` a , 123.456, b, c d, ( content: ` a , 123.456, b, c d, (
test (a,b,c) " 123 " test (a,b,c) " 123 "
),"(abc d", "abc) d", "(abc) d \" " 'abc "'`, ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
[]rune{',', ' ', '\t', '\n'}, separators: []rune{',', ' ', '\t', '\n'},
false, keepSeparator: false,
false, ignoreParenthesis: false,
[]string{ expectError: false,
expectTokens: []string{
"a", "a",
"123.456", "123.456",
"b", "b",
@ -178,14 +190,15 @@ func TestScanAll(t *testing.T) {
}, },
}, },
{ {
"custom separators (with preserve)", name: "custom separators (with preserve)",
` a , 123.456, b, c d, ( content: ` a , 123.456, b, c d, (
test (a,b,c) " 123 " test (a,b,c) " 123 "
),"(abc d", "abc) d", "(abc) d \" " 'abc "'`, ),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
[]rune{',', ' ', '\t', '\n'}, separators: []rune{',', ' ', '\t', '\n'},
true, keepSeparator: true,
false, ignoreParenthesis: false,
[]string{ expectError: false,
expectTokens: []string{
"a ", "a ",
"123.456,", "123.456,",
"b,", "b,",
@ -198,36 +211,53 @@ func TestScanAll(t *testing.T) {
`'abc "'`, `'abc "'`,
}, },
}, },
{
name: "ignoring parenthesis",
content: `a, b, (c,d)`,
separators: []rune{','},
keepSeparator: false,
ignoreParenthesis: true,
expectError: false,
expectTokens: []string{
"a",
"b",
"(c",
"d)",
},
},
} }
for _, s := range scenarios { for _, s := range scenarios {
tk := NewFromString(s.content) t.Run(s.name, func(t *testing.T) {
tk := NewFromString(s.content)
tk.Separators(s.separators...) tk.Separators(s.separators...)
tk.KeepSeparator(s.keepSeparator) tk.KeepSeparator(s.keepSeparator)
tk.IgnoreParenthesis(s.ignoreParenthesis)
tokens, err := tk.ScanAll() tokens, err := tk.ScanAll()
hasErr := err != nil hasErr := err != nil
if hasErr != s.expectError { if hasErr != s.expectError {
t.Fatalf("[%s] Expected hasErr %v, got %v (%v)", s.name, s.expectError, hasErr, err) t.Fatalf("Expected hasErr %v, got %v (%v)", s.expectError, hasErr, err)
} }
if len(tokens) != len(s.expectTokens) { if len(tokens) != len(s.expectTokens) {
t.Fatalf("[%s] Expected \n%v (%d), \ngot \n%v (%d)", s.name, s.expectTokens, len(s.expectTokens), tokens, len(tokens)) t.Fatalf("Expected \n%v (%d), \ngot \n%v (%d)", s.expectTokens, len(s.expectTokens), tokens, len(tokens))
} }
for _, tok := range tokens { for _, tok := range tokens {
exists := false exists := false
for _, def := range s.expectTokens { for _, def := range s.expectTokens {
if tok == def { if tok == def {
exists = true exists = true
break break
}
}
if !exists {
t.Fatalf("Unexpected token %s", tok)
} }
} }
if !exists { })
t.Fatalf("[%s] Unexpected token %s", s.name, tok)
}
}
} }
} }