added generic tokenizer helper

2025-03-18 13:47:47 +02:00 · 2023-02-05 20:53:48 +02:00 · 2023-02-05 20:53:48 +02:00 · 23dfa9c634
commit 23dfa9c634
parent 1b21e86be6
2 changed files with 386 additions and 0 deletions
--- a/tools/tokenizer/tokenizer.go
+++ b/tools/tokenizer/tokenizer.go
@ -0,0 +1,201 @@
+// Package tokenizer implements a rudimentary token parser for a buffered
+// io.Reader that respects quote and parenthesis boundaries.
+//
+// Example
+//
+//	tk := tokenizer.NewFromString("a, b, (c, d)")
+//	result, _ := tk.ScanAll() // ["a", "b", "(c, d)"]
+package tokenizer
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// eof represents a marker rune for the end of the reader.
+//
+// It is also the value that read returns on any reader error, so a literal
+// NUL rune in the input cannot be told apart from the end of the input.
+const eof = rune(0)
+
+// DefaultSeparators is a list with the default token separator characters
+// (comma, space, tab and newline).
+//
+// New assigns this slice to every Tokenizer it creates.
+var DefaultSeparators = []rune{',', ' ', '\t', '\n'}
+
+// NewFromString wraps str in a strings.Reader and returns a new Tokenizer
+// reading from it.
+func NewFromString(str string) *Tokenizer {
+	reader := strings.NewReader(str)
+
+	return New(reader)
+}
+
+// NewFromBytes wraps b in a bytes.Reader and returns a new Tokenizer
+// reading from it.
+func NewFromBytes(b []byte) *Tokenizer {
+	reader := bytes.NewReader(b)
+
+	return New(reader)
+}
+
+// New returns a new Tokenizer that reads from r through a bufio.Reader
+// and splits its tokens on DefaultSeparators.
+func New(r io.Reader) *Tokenizer {
+	tk := new(Tokenizer)
+	tk.r = bufio.NewReader(r)
+	tk.separators = DefaultSeparators
+
+	return tk
+}
+
+// Tokenizer defines a struct that parses a reader into tokens while
+// respecting quotes and parenthesis boundaries.
+type Tokenizer struct {
+	// r is the buffered source the runes are read from.
+	r *bufio.Reader
+
+	// separators holds the runes that end a token when they are found
+	// outside of quotes and parenthesis.
+	separators []rune
+}
+
+// SetSeparators replaces the separators of the current Tokenizer with
+// the provided ones.
+//
+// The variadic slice is stored as is (no copy is made).
+func (s *Tokenizer) SetSeparators(separators ...rune) {
+	s.separators = separators
+}
+
+// Scan reads and returns the next available token from the Tokenizer's buffer.
+//
+// The returned token is always trimmed of leading and trailing whitespace
+// (space, tab and newline), no matter which separators are configured.
+//
+// Returns [io.EOF] error when there are no more tokens to scan.
+func (s *Tokenizer) Scan() (string, error) {
+	ch := s.read()
+
+	if ch == eof {
+		return "", io.EOF
+	}
+
+	if isWhitespaceRune(ch) {
+		s.readWhiteSpaces()
+	} else {
+		// not a whitespace - put the rune back so that it becomes part of the token
+		s.unread()
+	}
+
+	token, err := s.readToken()
+	if err != nil {
+		return "", err
+	}
+
+	// consume the separators that trail the token
+	s.readWhiteSpaces()
+
+	// readWhiteSpaces skips only separator runes, so when the configured
+	// separators don't contain the whitespace runes (e.g. SetSeparators(','))
+	// the raw token may still be surrounded by whitespace.
+	return strings.TrimFunc(token, isWhitespaceRune), nil
+}
+
+// ScanAll calls Scan until the Tokenizer's buffer is exhausted and returns
+// every token that was found, in reading order.
+//
+// The first non-EOF error stops the scanning and is returned together with
+// a nil slice.
+func (s *Tokenizer) ScanAll() ([]string, error) {
+	result := []string{}
+
+	for {
+		tok, scanErr := s.Scan()
+
+		switch {
+		case scanErr == io.EOF:
+			// nothing left to read
+			return result, nil
+		case scanErr != nil:
+			return nil, scanErr
+		}
+
+		result = append(result, tok)
+	}
+}
+
+// readToken reads a single token from the buffer and returns it.
+//
+// The token ends at the first separator rune found outside of quotes and
+// parenthesis (the separator is consumed but not included) or at the end of
+// the reader. A backslash escapes the rune right after it, meaning that an
+// escaped quote or parenthesis doesn't open/close anything. An escaped
+// backslash is a plain character and doesn't escape the rune that follows it.
+//
+// An error is returned if the reader ends while a parenthesis or a quote
+// is still open.
+func (s *Tokenizer) readToken() (string, error) {
+	var buf bytes.Buffer
+	var parenthesis int
+	var quoteCh rune
+	var escaped bool // whether the current rune is preceded by an unescaped backslash
+
+	for {
+		ch := s.read()
+
+		if ch == eof {
+			break
+		}
+
+		if !escaped {
+			switch {
+			case ch == '(' && quoteCh == eof:
+				parenthesis++
+			case ch == ')' && parenthesis > 0 && quoteCh == eof:
+				parenthesis--
+			case isQuoteRune(ch):
+				if quoteCh == ch {
+					quoteCh = eof // reached closing quote
+				} else if quoteCh == eof {
+					quoteCh = ch // opening quote
+				}
+			}
+		}
+
+		if s.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
+			break
+		}
+
+		// a backslash starts an escape only when it is not escaped itself,
+		// otherwise `"a\\"` would never reach its closing quote
+		escaped = !escaped && isEscapeRune(ch)
+
+		buf.WriteRune(ch)
+	}
+
+	if parenthesis > 0 || quoteCh != eof {
+		return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
+	}
+
+	return buf.String(), nil
+}
+
+// readWhiteSpaces consumes all contiguous separator runes.
+//
+// Despite its name, the check is done against the Tokenizer's separators and
+// not against the whitespace runes. The first non-separator rune is placed
+// back on the reader.
+func (s *Tokenizer) readWhiteSpaces() {
+	for ch := s.read(); ch != eof; ch = s.read() {
+		if !s.isSeperatorRune(ch) {
+			s.unread()
+			return
+		}
+	}
+}
+
+// read returns the next rune of the buffered reader.
+// Any read failure, `io.EOF` included, is reported as `rune(0)`.
+func (s *Tokenizer) read() rune {
+	if ch, _, err := s.r.ReadRune(); err == nil {
+		return ch
+	}
+
+	return eof
+}
+
+// unread places the previously read rune back on the reader.
+//
+// It returns the error of the underlying UnreadRune call.
+func (s *Tokenizer) unread() error {
+	return s.r.UnreadRune()
+}
+
+// isSeperatorRune reports whether ch is one of the Tokenizer's separators.
+func (s *Tokenizer) isSeperatorRune(ch rune) bool {
+	for i := range s.separators {
+		if s.separators[i] == ch {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isWhitespaceRune reports whether ch is a space, a tab or a newline.
+func isWhitespaceRune(ch rune) bool {
+	switch ch {
+	case ' ', '\t', '\n':
+		return true
+	default:
+		return false
+	}
+}
+
+// isQuoteRune reports whether ch is a single quote, a double quote
+// or a backtick.
+func isQuoteRune(ch rune) bool {
+	switch ch {
+	case '\'', '"', '`':
+		return true
+	default:
+		return false
+	}
+}
+
+// isEscapeRune checks if a rune is an escape character.
+func isEscapeRune(ch rune) bool {
+	return ch == '\\'
+}
--- a/tools/tokenizer/tokenizer_test.go
+++ b/tools/tokenizer/tokenizer_test.go
@ -0,0 +1,185 @@
+package tokenizer
+
+import (
+	"io"
+	"strings"
+	"testing"
+)
+
+// TestFactories checks that every factory function returns a Tokenizer
+// that reads the provided content and is configured with DefaultSeparators.
+func TestFactories(t *testing.T) {
+	expectedContent := "test"
+
+	scenarios := []struct {
+		name string
+		tk   *Tokenizer
+	}{
+		{
+			"New()",
+			New(strings.NewReader(expectedContent)),
+		},
+		{
+			"NewFromString()",
+			NewFromString(expectedContent),
+		},
+		{
+			"NewFromBytes()",
+			NewFromBytes([]byte(expectedContent)),
+		},
+	}
+
+	for _, s := range scenarios {
+		// the content has no NUL delimiter, so ReadString drains the reader
+		// and reports io.EOF, which is expected and therefore ignored
+		content, _ := s.tk.r.ReadString(0)
+
+		if content != expectedContent {
+			t.Fatalf("[%s] Expected reader with content %q, got %q", s.name, expectedContent, content)
+		}
+
+		if len(s.tk.separators) != len(DefaultSeparators) {
+			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, DefaultSeparators, s.tk.separators)
+		}
+
+		// each separator of the tokenizer must be one of the defaults
+		for _, r := range s.tk.separators {
+			exists := false
+			for _, def := range DefaultSeparators {
+				if r == def {
+					exists = true
+					break
+				}
+			}
+			if !exists {
+				t.Fatalf("[%s] Unexpected separator %s", s.name, string(r))
+			}
+		}
+	}
+}
+
+// TestScan checks that consecutive Scan calls return the tokens one by one
+// and that io.EOF with an empty token is reported once the input is exhausted.
+func TestScan(t *testing.T) {
+	tk := NewFromString("abc 123.456 (abc)")
+
+	expectedTokens := []string{"abc", "123.456", "(abc)"}
+
+	for _, token := range expectedTokens {
+		result, err := tk.Scan()
+		if err != nil {
+			t.Fatalf("Expected token %q, got error %v", token, err)
+		}
+
+		if result != token {
+			t.Fatalf("Expected token %q, got %q", token, result)
+		}
+	}
+
+	// all tokens were read, so one more scan must report EOF
+	token, err := tk.Scan()
+	if err != io.EOF {
+		t.Fatalf("Expected EOF error, got %v", err)
+	}
+	if token != "" {
+		t.Fatalf("Expected empty token, got %q", token)
+	}
+}
+
+// TestScanAllWithDefaultSeparators runs ScanAll against several inputs
+// (with the default, custom and no separators at all) and checks both the
+// error state and the exact sequence of the returned tokens.
+func TestScanAllWithDefaultSeparators(t *testing.T) {
+	scenarios := []struct {
+		name         string
+		content      string
+		separators   []rune
+		expectError  bool
+		expectTokens []string
+	}{
+		{
+			"empty string",
+			"",
+			DefaultSeparators,
+			false,
+			nil,
+		},
+		{
+			"unbalanced parenthesis",
+			`(a,b() c`,
+			DefaultSeparators,
+			true,
+			[]string{},
+		},
+		{
+			"unmatching quotes",
+			`'asd"`,
+			DefaultSeparators,
+			true,
+			[]string{},
+		},
+		{
+			"no separators",
+			`a, b, c, d, e 123, "abc"`,
+			nil,
+			false,
+			[]string{
+				`a, b, c, d, e 123, "abc"`,
+			},
+		},
+		{
+			"default separators",
+			`   a   , 123.456, b, c d, (
+				test (a,b,c) " 123 "
+			),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
+			DefaultSeparators,
+			false,
+			[]string{
+				"a",
+				"123.456",
+				"b",
+				"c",
+				"d",
+				"(\n\t\t\t\ttest (a,b,c) \" 123 \"\n\t\t\t)",
+				`"(abc d"`,
+				`"abc) d"`,
+				`"(abc) d \" "`,
+				`'abc "'`,
+			},
+		},
+		{
+			"custom separators",
+			`a, b, c, d e, "a,b,  c  ", (123, 456)`,
+			[]rune{','},
+			false,
+			[]string{
+				"a",
+				"b",
+				"c",
+				"d e",
+				`"a,b,  c  "`,
+				`(123, 456)`,
+			},
+		},
+	}
+
+	for _, s := range scenarios {
+		tk := NewFromString(s.content)
+
+		tk.SetSeparators(s.separators...)
+
+		tokens, err := tk.ScanAll()
+
+		hasErr := err != nil
+		if hasErr != s.expectError {
+			t.Fatalf("[%s] Expected hasErr %v, got %v (%v)", s.name, s.expectError, hasErr, err)
+		}
+
+		if len(tokens) != len(s.expectTokens) {
+			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, s.expectTokens, tokens)
+		}
+
+		// compare position by position (the lengths are already equal) so
+		// that wrongly ordered or duplicated tokens are caught too
+		for i, tok := range tokens {
+			if tok != s.expectTokens[i] {
+				t.Fatalf("[%s] Expected token %q at index %d, got %q", s.name, s.expectTokens[i], i, tok)
+			}
+		}
+	}
+}