1
0
mirror of https://github.com/pocketbase/pocketbase.git synced 2025-03-18 05:37:49 +02:00

added generic tokenizer helper

This commit is contained in:
Gani Georgiev 2023-02-05 20:53:48 +02:00
parent 1b21e86be6
commit 23dfa9c634
2 changed files with 386 additions and 0 deletions

View File

@ -0,0 +1,201 @@
// Package tokenizer implements a rudimentary token parser on top of a
// buffered io.Reader that respects quote and parenthesis boundaries.
//
// Example
//
// tk := tokenizer.NewFromString("a, b, (c, d)")
// result, _ := tk.ScanAll() // ["a", "b", "(c, d)"]
package tokenizer
import (
"bufio"
"bytes"
"fmt"
"io"
"strings"
)
// eof is the zero rune, used as the marker for the end of the reader.
const eof rune = 0
// DefaultSeparators holds the default token separator runes, in order:
// comma, space, tab and newline.
var DefaultSeparators = []rune(", \t\n")
// NewFromString wraps str in a strings.Reader and returns a Tokenizer
// created from it with New.
func NewFromString(str string) *Tokenizer {
	reader := strings.NewReader(str)

	return New(reader)
}
// NewFromBytes wraps b in a bytes.Reader and returns a Tokenizer
// created from it with New.
func NewFromBytes(b []byte) *Tokenizer {
	reader := bytes.NewReader(b)

	return New(reader)
}
// New returns a Tokenizer that reads from r through a buffered reader
// and that splits tokens on DefaultSeparators.
func New(r io.Reader) *Tokenizer {
	tk := new(Tokenizer)
	tk.r = bufio.NewReader(r)
	tk.separators = DefaultSeparators

	return tk
}
// Tokenizer splits the content of a reader into tokens. Separators that
// appear inside parentheses or inside a quoted expression do not end a token.
type Tokenizer struct {
	// r is the buffered source the runes are read from.
	r *bufio.Reader

	// separators lists the runes that terminate a token.
	separators []rune
}
// SetSeparators replaces the separator runes used by the Tokenizer.
//
// Calling it without arguments leaves the Tokenizer with no separators.
func (s *Tokenizer) SetSeparators(separators ...rune) {
	s.separators = separators
}
// Scan reads and returns the next available token from the Tokenizer's buffer.
//
// When the next rune is a whitespace (space, tab or newline), that rune and
// the separators directly following it are skipped before the token is read.
// The separators that directly follow the token are consumed afterwards.
//
// Returns [io.EOF] error when there are no more tokens to scan, which
// includes the case where only skippable whitespace is left in the buffer.
// Any error reported while reading the token itself is returned as is.
func (s *Tokenizer) Scan() (string, error) {
	ch := s.read()
	if ch == eof {
		return "", io.EOF
	}

	if isWhitespaceRune(ch) {
		s.readWhiteSpaces()

		// Nothing is left after the skipped runes: report the end of the
		// input instead of returning a spurious empty token.
		if s.read() == eof {
			return "", io.EOF
		}
	}

	// Put back the first rune of the token so that readToken can see it.
	if err := s.unread(); err != nil {
		return "", err
	}

	token, err := s.readToken()
	if err != nil {
		return "", err
	}

	// Consume the separators trailing the token so that the next call
	// starts at the beginning of the following token.
	s.readWhiteSpaces()

	return token, nil
}
// ScanAll calls Scan until it reports [io.EOF] and returns the collected
// tokens in the order in which they were read.
//
// Any other Scan error stops the loop and is returned with a nil slice.
func (s *Tokenizer) ScanAll() ([]string, error) {
	result := []string{}

	for {
		tok, scanErr := s.Scan()

		switch {
		case scanErr == io.EOF:
			return result, nil
		case scanErr != nil:
			return nil, scanErr
		}

		result = append(result, tok)
	}
}
// readToken reads a single token from the buffer and returns it.
//
// Runes are accumulated until the end of the input or until a separator is
// met outside of any parentheses and quotes. The terminating separator is
// consumed but it is not part of the returned token.
//
// A rune that directly follows an unescaped backslash is treated as escaped
// and never opens or closes a parenthesis or a quote. A backslash that is
// itself escaped (e.g. the second one in `\\`) does not escape the next rune.
//
// An error is returned when the input ends while a parenthesis or a quote
// is still open.
func (s *Tokenizer) readToken() (string, error) {
	var buf strings.Builder

	// depth of the currently open parentheses found outside of quotes
	var parenthesis int

	// the quote rune that opened the current quoted expression (eof if none)
	var quoteCh rune

	// whether the rune being processed follows an unescaped backslash
	var escaped bool

	for {
		ch := s.read()
		if ch == eof {
			break
		}

		if !escaped {
			switch {
			case ch == '(' && quoteCh == eof:
				parenthesis++
			case ch == ')' && parenthesis > 0 && quoteCh == eof:
				parenthesis--
			case isQuoteRune(ch):
				if quoteCh == ch {
					quoteCh = eof // reached closing quote
				} else if quoteCh == eof {
					quoteCh = ch // opening quote
				}
			}
		}

		if parenthesis == 0 && quoteCh == eof && s.isSeperatorRune(ch) {
			break
		}

		// Track the escape state instead of only the previous rune so that
		// an escaped backslash doesn't escape the rune that comes after it.
		escaped = !escaped && isEscapeRune(ch)

		buf.WriteRune(ch)
	}

	if parenthesis > 0 || quoteCh != eof {
		return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
	}

	return buf.String(), nil
}
// readWhiteSpaces advances the reader past every consecutive rune that is
// one of the Tokenizer's separators (despite the method name, the check is
// done against the separators and not against the whitespace runes).
//
// The first non-separator rune is placed back on the reader.
func (s *Tokenizer) readWhiteSpaces() {
	for ch := s.read(); ch != eof; ch = s.read() {
		if s.isSeperatorRune(ch) {
			continue
		}

		s.unread()
		return
	}
}
// read returns the next rune of the buffered reader.
//
// Every read failure, `io.EOF` included, is reported as the eof rune
// (`rune(0)`); the underlying error is not exposed.
func (s *Tokenizer) read() rune {
	if ch, _, err := s.r.ReadRune(); err == nil {
		return ch
	}

	return eof
}
// unread pushes the most recently read rune back onto the buffered reader
// and returns the error reported by the reader, if any.
func (s *Tokenizer) unread() error {
	err := s.r.UnreadRune()

	return err
}
// isSeperatorRune reports whether ch is one of the Tokenizer's separators.
func (s *Tokenizer) isSeperatorRune(ch rune) bool {
	for i := range s.separators {
		if s.separators[i] == ch {
			return true
		}
	}

	return false
}
// isWhitespaceRune reports whether ch is a space, a tab or a newline.
func isWhitespaceRune(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n':
		return true
	default:
		return false
	}
}
// isQuoteRune reports whether ch is a single quote, a double quote
// or a backtick.
func isQuoteRune(ch rune) bool {
	switch ch {
	case '\'', '"', '`':
		return true
	default:
		return false
	}
}
// isEscapeRune reports whether ch is the backslash escape character.
func isEscapeRune(ch rune) bool {
	const backslash = '\\'

	return ch == backslash
}

View File

@ -0,0 +1,185 @@
package tokenizer
import (
"io"
"strings"
"testing"
)
// TestFactories verifies that every factory wires the provided content into
// the Tokenizer's reader and initializes it with the default separators.
func TestFactories(t *testing.T) {
	expectedContent := "test"

	scenarios := []struct {
		name string
		tk   *Tokenizer
	}{
		{
			"New()",
			New(strings.NewReader(expectedContent)),
		},
		{
			"NewFromString()",
			NewFromString(expectedContent),
		},
		{
			"NewFromBytes()",
			NewFromBytes([]byte(expectedContent)),
		},
	}

	for _, s := range scenarios {
		// The 0 delimiter doesn't occur in the content, so ReadString drains
		// the whole reader and reports io.EOF, which is expected and ignored.
		content, _ := s.tk.r.ReadString(0)
		if content != expectedContent {
			t.Fatalf("[%s] Expected reader with content %q, got %q", s.name, expectedContent, content)
		}

		if len(s.tk.separators) != len(DefaultSeparators) {
			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, DefaultSeparators, s.tk.separators)
		}

		// Every tokenizer separator must be one of the defaults (comparing
		// the tokenizer separators against themselves could never fail).
		for _, r := range s.tk.separators {
			exists := false

			for _, def := range DefaultSeparators {
				if r == def {
					exists = true
					break
				}
			}

			if !exists {
				t.Fatalf("[%s] Unexpected separator %q", s.name, r)
			}
		}
	}
}
// TestScan verifies that consecutive Scan calls return the tokens one by one
// and that scanning past the last token reports io.EOF with an empty token.
func TestScan(t *testing.T) {
	tk := NewFromString("abc 123.456 (abc)")

	expectedTokens := []string{"abc", "123.456", "(abc)"}

	for _, token := range expectedTokens {
		result, err := tk.Scan()
		if err != nil {
			t.Fatalf("Expected token %q, got error %v", token, err)
		}

		if result != token {
			t.Fatalf("Expected token %q, got %q", token, result)
		}
	}

	// scan past the last token
	token, err := tk.Scan()
	if err != io.EOF {
		t.Fatalf("Expected EOF error, got %v", err)
	}

	if token != "" {
		t.Fatalf("Expected empty token, got %q", token)
	}
}
// TestScanAllWithDefaultSeparators runs ScanAll against a table of inputs
// (with the default, with custom and without separators) and verifies both
// the error state and the exact sequence of the returned tokens.
func TestScanAllWithDefaultSeparators(t *testing.T) {
	scenarios := []struct {
		name         string
		content      string
		separators   []rune
		expectError  bool
		expectTokens []string
	}{
		{
			"empty string",
			"",
			DefaultSeparators,
			false,
			nil,
		},
		{
			"unbalanced parenthesis",
			`(a,b() c`,
			DefaultSeparators,
			true,
			[]string{},
		},
		{
			"unmatching quotes",
			`'asd"`,
			DefaultSeparators,
			true,
			[]string{},
		},
		{
			"no separators",
			`a, b, c, d, e 123, "abc"`,
			nil,
			false,
			[]string{
				`a, b, c, d, e 123, "abc"`,
			},
		},
		{
			"default separators",
			// The newlines and tabs inside of the parenthesis are spelled
			// out explicitly so that the fixture doesn't depend on how this
			// file is indented.
			" a , 123.456, b, c d, (\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)" +
				`,"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
			DefaultSeparators,
			false,
			[]string{
				"a",
				"123.456",
				"b",
				"c",
				"d",
				"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)",
				`"(abc d"`,
				`"abc) d"`,
				`"(abc) d \" "`,
				`'abc "'`,
			},
		},
		{
			"custom separators",
			`a, b, c, d e, "a,b, c ", (123, 456)`,
			[]rune{','},
			false,
			[]string{
				"a",
				"b",
				"c",
				"d e",
				`"a,b, c "`,
				`(123, 456)`,
			},
		},
	}

	for _, s := range scenarios {
		t.Run(s.name, func(t *testing.T) {
			tk := NewFromString(s.content)
			tk.SetSeparators(s.separators...)

			tokens, err := tk.ScanAll()

			hasErr := err != nil
			if hasErr != s.expectError {
				t.Fatalf("Expected hasErr %v, got %v (%v)", s.expectError, hasErr, err)
			}

			if len(tokens) != len(s.expectTokens) {
				t.Fatalf("Expected \n%v, \ngot \n%v", s.expectTokens, tokens)
			}

			// Compare position by position so that a wrong order or a
			// duplicated token is reported as a failure too.
			for i, tok := range tokens {
				if tok != s.expectTokens[i] {
					t.Fatalf("Expected token %q at index %d, got %q", s.expectTokens[i], i, tok)
				}
			}
		})
	}
}