diff --git a/tools/tokenizer/tokenizer.go b/tools/tokenizer/tokenizer.go
new file mode 100644
index 00000000..b9fbc09b
--- /dev/null
+++ b/tools/tokenizer/tokenizer.go
@@ -0,0 +1,201 @@
+// Package tokenizer implements a rudimentary tokenizer that parses a buffered
+// io.Reader into tokens while respecting quote and parenthesis boundaries.
+//
+// Example
+//
+//	tk := tokenizer.NewFromString("a, b, (c, d)")
+//	result, _ := tk.ScanAll() // ["a", "b", "(c, d)"]
+package tokenizer
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// eof represents a marker rune for the end of the reader.
+const eof = rune(0)
+
+// DefaultSeparators is a list of the default token separator characters.
+var DefaultSeparators = []rune{',', ' ', '\t', '\n'}
+
+// NewFromString creates a new Tokenizer from the provided string.
+func NewFromString(str string) *Tokenizer {
+	return New(strings.NewReader(str))
+}
+
+// NewFromBytes creates a new Tokenizer from the provided byte slice.
+func NewFromBytes(b []byte) *Tokenizer {
+	return New(bytes.NewReader(b))
+}
+
+// New creates a new Tokenizer from the provided reader.
+func New(r io.Reader) *Tokenizer {
+	return &Tokenizer{
+		r:          bufio.NewReader(r),
+		separators: DefaultSeparators,
+	}
+}
+
+// Tokenizer defines a struct that parses a reader into tokens while
+// respecting quote and parenthesis boundaries.
+type Tokenizer struct {
+	r *bufio.Reader
+
+	separators []rune
+}
+
+// SetSeparators replaces the separators of the current Tokenizer.
+func (s *Tokenizer) SetSeparators(separators ...rune) {
+	s.separators = separators
+}
+
+// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
+//
+// Returns [io.EOF] error when there are no more tokens to scan.
+func (s *Tokenizer) Scan() (string, error) {
+	ch := s.read()
+
+	if ch == eof {
+		return "", io.EOF
+	}
+
+	if isWhitespaceRune(ch) {
+		s.readWhiteSpaces()
+	} else {
+		s.unread()
+	}
+
+	token, err := s.readToken()
+	if err != nil {
+		return "", err
+	}
+
+	// consume any trailing separator runes
+	s.readWhiteSpaces()
+
+	return token, nil
+}
+
+// ScanAll reads the entire Tokenizer's buffer and returns all found tokens.
+func (s *Tokenizer) ScanAll() ([]string, error) {
+	tokens := []string{}
+
+	for {
+		token, err := s.Scan()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+
+			return nil, err
+		}
+
+		tokens = append(tokens, token)
+	}
+
+	return tokens, nil
+}
+
+// readToken reads a single token from the buffer and returns it.
+func (s *Tokenizer) readToken() (string, error) {
+	var buf bytes.Buffer
+	var parenthesis int
+	var quoteCh rune
+	var prevCh rune
+
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !isEscapeRune(prevCh) {
+			if ch == '(' && quoteCh == eof {
+				parenthesis++
+			} else if ch == ')' && parenthesis > 0 && quoteCh == eof {
+				parenthesis--
+			} else if isQuoteRune(ch) {
+				if quoteCh == ch {
+					quoteCh = eof // reached closing quote
+				} else if quoteCh == eof {
+					quoteCh = ch // opening quote
+				}
+			}
+		}
+
+		if s.isSeparatorRune(ch) && parenthesis == 0 && quoteCh == eof {
+			break
+		}
+
+		prevCh = ch
+		buf.WriteRune(ch)
+	}
+
+	if parenthesis > 0 || quoteCh != eof {
+		return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
+	}
+
+	return buf.String(), nil
+}
+
+// readWhiteSpaces consumes all contiguous separator runes.
+func (s *Tokenizer) readWhiteSpaces() {
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !s.isSeparatorRune(ch) {
+			s.unread()
+			break
+		}
+	}
+}
+
+// read reads the next rune from the buffered reader.
+// Returns eof (rune(0)) if an error occurs, including [io.EOF].
+func (s *Tokenizer) read() rune {
+	ch, _, err := s.r.ReadRune()
+	if err != nil {
+		return eof
+	}
+
+	return ch
+}
+
+// unread places the previously read rune back on the reader.
+func (s *Tokenizer) unread() error {
+	return s.r.UnreadRune()
+}
+
+// isSeparatorRune checks if a rune is a token separator.
+func (s *Tokenizer) isSeparatorRune(ch rune) bool {
+	for _, r := range s.separators {
+		if ch == r {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isWhitespaceRune checks if a rune is a space, tab, or newline.
+func isWhitespaceRune(ch rune) bool {
+	return ch == ' ' || ch == '\t' || ch == '\n'
+}
+
+// isQuoteRune checks if a rune is a quote.
+func isQuoteRune(ch rune) bool {
+	return ch == '\'' || ch == '"' || ch == '`'
+}
+
+// isEscapeRune checks if a rune is an escape character.
+func isEscapeRune(ch rune) bool {
+	return ch == '\\'
+}
diff --git a/tools/tokenizer/tokenizer_test.go b/tools/tokenizer/tokenizer_test.go
new file mode 100644
index 00000000..134ae2d8
--- /dev/null
+++ b/tools/tokenizer/tokenizer_test.go
@@ -0,0 +1,185 @@
+package tokenizer
+
+import (
+	"io"
+	"strings"
+	"testing"
+)
+
+func TestFactories(t *testing.T) {
+	expectedContent := "test"
+
+	scenarios := []struct {
+		name string
+		tk   *Tokenizer
+	}{
+		{
+			"New()",
+			New(strings.NewReader(expectedContent)),
+		},
+		{
+			"NewFromString()",
+			NewFromString(expectedContent),
+		},
+		{
+			"NewFromBytes()",
+			NewFromBytes([]byte(expectedContent)),
+		},
+	}
+
+	for _, s := range scenarios {
+		content, _ := s.tk.r.ReadString(0)
+
+		if content != expectedContent {
+			t.Fatalf("[%s] Expected reader with content %q, got %q", s.name, expectedContent, content)
+		}
+
+		if len(s.tk.separators) != len(DefaultSeparators) {
+			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, DefaultSeparators, s.tk.separators)
+		}
+
+		for _, r := range s.tk.separators {
+			exists := false
+			for _, def := range DefaultSeparators {
+				if r == def {
+					exists = true
+					break
+				}
+			}
+			if !exists {
+				t.Fatalf("[%s] Unexpected separator %s", s.name, string(r))
+			}
+		}
+	}
+}
+
+func TestScan(t *testing.T) {
+	tk := NewFromString("abc 123.456 (abc)")
+
+	expectedTokens := []string{"abc", "123.456", "(abc)"}
+
+	for _, token := range expectedTokens {
+		result, err := tk.Scan()
+		if err != nil {
+			t.Fatalf("Expected token %q, got error %v", token, err)
+		}
+
+		if result != token {
+			t.Fatalf("Expected token %q, got %q", token, result)
+		}
+	}
+
+	// scanning past the last token should return io.EOF
+	token, err := tk.Scan()
+	if err != io.EOF {
+		t.Fatalf("Expected EOF error, got %v", err)
+	}
+	if token != "" {
+		t.Fatalf("Expected empty token, got %q", token)
+	}
+}
+
+func TestScanAll(t *testing.T) {
+	scenarios := []struct {
+		name         string
+		content      string
+		separators   []rune
+		expectError  bool
+		expectTokens []string
+	}{
+		{
+			"empty string",
+			"",
+			DefaultSeparators,
+			false,
+			nil,
+		},
+		{
+			"unbalanced parenthesis",
+			`(a,b() c`,
+			DefaultSeparators,
+			true,
+			[]string{},
+		},
+		{
+			"unmatched quotes",
+			`'asd"`,
+			DefaultSeparators,
+			true,
+			[]string{},
+		},
+		{
+			"no separators",
+			`a, b, c, d, e 123, "abc"`,
+			nil,
+			false,
+			[]string{
+				`a, b, c, d, e 123, "abc"`,
+			},
+		},
+		{
+			"default separators",
+			` a , 123.456, b, c d, (
+				test (a,b,c) " 123 "
+			),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
+			DefaultSeparators,
+			false,
+			[]string{
+				"a",
+				"123.456",
+				"b",
+				"c",
+				"d",
+				"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)",
+				`"(abc d"`,
+				`"abc) d"`,
+				`"(abc) d \" "`,
+				`'abc "'`,
+			},
+		},
+		{
+			"custom separators",
+			`a, b, c, d e, "a,b, c ", (123, 456)`,
+			[]rune{','},
+			false,
+			[]string{
+				"a",
+				"b",
+				"c",
+				"d e",
+				`"a,b, c "`,
+				`(123, 456)`,
+			},
+		},
+	}
+
+	for _, s := range scenarios {
+		tk := NewFromString(s.content)
+
+		tk.SetSeparators(s.separators...)
+
+		tokens, err := tk.ScanAll()
+
+		hasErr := err != nil
+		if hasErr != s.expectError {
+			t.Fatalf("[%s] Expected hasErr %v, got %v (%v)", s.name, s.expectError, hasErr, err)
+		}
+
+		if len(tokens) != len(s.expectTokens) {
+			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, s.expectTokens, tokens)
+		}
+
+		for _, tok := range tokens {
+			exists := false
+			for _, def := range s.expectTokens {
+				if tok == def {
+					exists = true
+					break
+				}
+			}
+			if !exists {
+				t.Fatalf("[%s] Unexpected token %s", s.name, tok)
+			}
+		}
+	}
+}
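
For quick reference, a fuller usage sketch of the API this patch adds. The import path below is illustrative (the module root is not visible in this diff); everything else uses only functions defined in tokenizer.go, and the expected outputs were derived by hand from the scanning rules above.

	package main

	import (
		"fmt"
		"log"

		// Illustrative import path -- adjust to the module that hosts tools/tokenizer.
		"yourmodule/tools/tokenizer"
	)

	func main() {
		// With the default separators (',', ' ', '\t', '\n'), quoted and
		// parenthesized groups survive as single tokens, quotes included.
		tk := tokenizer.NewFromString(`title, "full name", (select id from users)`)

		tokens, err := tk.ScanAll()
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%q\n", tokens) // ["title" "\"full name\"" "(select id from users)"]

		// With ',' as the only separator, plain spaces stay inside tokens.
		tk = tokenizer.NewFromString(`a, b c, (d, e)`)
		tk.SetSeparators(',')

		tokens, err = tk.ScanAll()
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%q\n", tokens) // ["a" "b c" "(d, e)"]
	}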