mirror of
https://github.com/pocketbase/pocketbase.git
synced 2025-01-26 06:55:51 +02:00
248 lines
5.4 KiB
Go
248 lines
5.4 KiB
Go
// Package tokenizer implements a rudimentary tokens parser of buffered
|
|
// io.Reader while respecting quotes and parenthesis boundaries.
|
|
//
|
|
// Example
|
|
//
|
|
// tk := tokenizer.NewFromString("a, b, (c, d)")
|
|
// result, _ := tk.ScanAll() // ["a", "b", "(c, d)"]
|
|
package tokenizer
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
// eof represents a marker rune for the end of the reader.
// rune(0) is safe as a sentinel because it never appears in valid input text.
const eof = rune(0)
|
|
|
|
// DefaultSeparators is a list with the default token separator characters.
// It is applied to every Tokenizer created via New (and its helpers) and
// can be overridden per instance with the Separators method.
var DefaultSeparators = []rune{','}
|
|
|
|
// whitespaceChars lists the runes trimmed from the edges of each scanned token
// (standard ASCII whitespace plus 0x85/NEL and 0xA0/NBSP).
var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}
|
|
|
|
// NewFromString creates new Tokenizer from the provided string.
|
|
func NewFromString(str string) *Tokenizer {
|
|
return New(strings.NewReader(str))
|
|
}
|
|
|
|
// NewFromBytes creates new Tokenizer from the provided bytes slice.
|
|
func NewFromBytes(b []byte) *Tokenizer {
|
|
return New(bytes.NewReader(b))
|
|
}
|
|
|
|
// New creates new Tokenizer from the provided reader with DefaultSeparators.
|
|
func New(r io.Reader) *Tokenizer {
|
|
t := &Tokenizer{r: bufio.NewReader(r)}
|
|
|
|
t.Separators(DefaultSeparators...)
|
|
|
|
return t
|
|
}
|
|
|
|
// Tokenizer defines a struct that parses a reader into tokens while
// respecting quotes and parenthesis boundaries.
type Tokenizer struct {
	r *bufio.Reader // buffered source the tokens are read from

	trimCutset        string // characters trimmed from both ends of each token (derived from whitespaceChars minus the separators)
	separators        []rune // runes that terminate a token (see Separators)
	keepSeparator     bool   // when true, the terminating separator is kept as part of the token
	keepEmptyTokens   bool   // when true, Scan returns empty tokens instead of skipping them
	ignoreParenthesis bool   // when true, '(' and ')' are treated as regular characters
}
|
|
|
|
// Separators defines the provided separators of the current Tokenizer.
//
// It also rebuilds the trim cutset so that a rune cannot be both a
// separator and a trimmed whitespace character at the same time.
func (t *Tokenizer) Separators(separators ...rune) {
	t.separators = separators

	t.rebuildTrimCutset()
}
|
|
|
|
// KeepSeparator defines whether to keep the separator rune as part
// of the token (default to false).
func (t *Tokenizer) KeepSeparator(state bool) {
	t.keepSeparator = state
}
|
|
|
|
// KeepEmptyTokens defines whether to keep empty tokens on Scan() (default to false).
func (t *Tokenizer) KeepEmptyTokens(state bool) {
	t.keepEmptyTokens = state
}
|
|
|
|
// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
// and to treat the '(' and ')' as regular characters.
func (t *Tokenizer) IgnoreParenthesis(state bool) {
	t.ignoreParenthesis = state
}
|
|
|
|
// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
|
|
//
|
|
// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
|
|
//
|
|
// Returns [io.EOF] error when there are no more tokens to scan.
|
|
func (t *Tokenizer) Scan() (string, error) {
|
|
ch := t.read()
|
|
if ch == eof {
|
|
return "", io.EOF
|
|
}
|
|
t.unread()
|
|
|
|
token, err := t.readToken()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if !t.keepEmptyTokens && token == "" {
|
|
return t.Scan()
|
|
}
|
|
|
|
return token, err
|
|
}
|
|
|
|
// ScanAll reads the entire Tokenizer's buffer and return all found tokens.
|
|
func (t *Tokenizer) ScanAll() ([]string, error) {
|
|
tokens := []string{}
|
|
|
|
for {
|
|
token, err := t.Scan()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
|
|
return nil, err
|
|
}
|
|
|
|
tokens = append(tokens, token)
|
|
}
|
|
|
|
return tokens, nil
|
|
}
|
|
|
|
// readToken reads a single token from the buffer and returns it.
//
// A token ends at the first separator rune that is not inside an open
// parenthesis group or quoted segment, or at the end of the buffer.
// The returned token is trimmed using t.trimCutset.
//
// Returns an error when the buffer ends while a parenthesis group or a
// quote is still open.
func (t *Tokenizer) readToken() (string, error) {
	var buf bytes.Buffer
	var parenthesis int // depth of currently open '(' groups
	var quoteCh rune    // the currently open quote rune, or eof when outside quotes
	var prevCh rune     // previously consumed rune, used to detect backslash-escapes

	for {
		ch := t.read()

		if ch == eof {
			break
		}

		// boundary runes are only significant when not escaped by the previous rune
		if !t.isEscapeRune(prevCh) {
			if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
				parenthesis++ // opening parenthesis
			} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
				parenthesis-- // closing parenthesis
			} else if t.isQuoteRune(ch) {
				if quoteCh == ch {
					quoteCh = eof // closing quote
				} else if quoteCh == eof {
					quoteCh = ch // opening quote
				}
				// a different quote rune inside an open quote is kept as a regular character
			}
		}

		// a separator terminates the token only outside of parenthesis and quotes
		if t.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
			if t.keepSeparator {
				buf.WriteRune(ch)
			}
			break
		}

		prevCh = ch
		buf.WriteRune(ch)
	}

	// unbalanced state check (note: extra ')' runes are tolerated and kept as-is)
	if parenthesis > 0 || quoteCh != eof {
		return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
	}

	return strings.Trim(buf.String(), t.trimCutset), nil
}
|
|
|
|
// readWhiteSpaces consumes all contiguous whitespace runes.
|
|
func (t *Tokenizer) readWhiteSpaces() {
|
|
for {
|
|
ch := t.read()
|
|
|
|
if ch == eof {
|
|
break
|
|
}
|
|
|
|
if !t.isWhitespaceRune(ch) {
|
|
t.unread()
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// read reads the next rune from the buffered reader.
|
|
// Returns the `rune(0)` if an error or `io.EOF` occurs.
|
|
func (t *Tokenizer) read() rune {
|
|
ch, _, err := t.r.ReadRune()
|
|
if err != nil {
|
|
return eof
|
|
}
|
|
|
|
return ch
|
|
}
|
|
|
|
// unread places the previously read rune back on the reader.
//
// Note: bufio.Reader supports unreading only the single most recently
// read rune - calling unread twice without a read in between errors.
func (t *Tokenizer) unread() error {
	return t.r.UnreadRune()
}
|
|
|
|
// rebuildTrimCutset rebuilds the tokenizer trimCutset based on its separator runes.
|
|
func (t *Tokenizer) rebuildTrimCutset() {
|
|
var cutset strings.Builder
|
|
|
|
for _, w := range whitespaceChars {
|
|
if t.isSeperatorRune(w) {
|
|
continue
|
|
}
|
|
cutset.WriteRune(w)
|
|
}
|
|
|
|
t.trimCutset = cutset.String()
|
|
}
|
|
|
|
// isSeperatorRune checks if a rune is a token part separator.
|
|
func (t *Tokenizer) isSeperatorRune(ch rune) bool {
|
|
for _, r := range t.separators {
|
|
if ch == r {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// isWhitespaceRune checks if a rune is a space character (eg. space, tab, new line).
|
|
func (t *Tokenizer) isWhitespaceRune(ch rune) bool {
|
|
for _, c := range whitespaceChars {
|
|
if c == ch {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// isQuoteRune checks if a rune is a quote.
|
|
func (t *Tokenizer) isQuoteRune(ch rune) bool {
|
|
return ch == '\'' || ch == '"' || ch == '`'
|
|
}
|
|
|
|
// isEscapeRune checks if a rune is an escape character.
|
|
func (t *Tokenizer) isEscapeRune(ch rune) bool {
|
|
return ch == '\\'
|
|
}
|