added tokenizer.IgnoreParenthesis() to allow skipping the boundary checks for parenthesis characters

2025-09-16 09:36:20 +02:00 · 2023-09-17 12:14:57 +03:00
parent 71f9be3cb0
commit f3bcd7d3df
2 changed files with 171 additions and 133 deletions
--- a/tools/tokenizer/tokenizer.go
+++ b/tools/tokenizer/tokenizer.go
@@ -34,9 +34,10 @@ func NewFromBytes(b []byte) *Tokenizer {
 // New creates a new Tokenizer from the provided reader with DefaultSeparators.
 func New(r io.Reader) *Tokenizer {
 	return &Tokenizer{
-		r:             bufio.NewReader(r),
+		r:                 bufio.NewReader(r),
-		separators:    DefaultSeparators,
+		separators:        DefaultSeparators,
-		keepSeparator: false,
+		keepSeparator:     false,
 		ignoreParenthesis: false,
 	}
 }
@@ -45,54 +46,61 @@ func New(r io.Reader) *Tokenizer {
 type Tokenizer struct {
 	r *bufio.Reader
-	separators    []rune
+	separators        []rune
-	keepSeparator bool
+	keepSeparator     bool
 	ignoreParenthesis bool
 }
 // Separators sets the provided runes as the separators of the current Tokenizer.
-func (s *Tokenizer) Separators(separators ...rune) {
+func (t *Tokenizer) Separators(separators ...rune) {
-	s.separators = separators
+	t.separators = separators
 }
 // KeepSeparator defines whether to keep the separator rune as part
 // of the token (defaults to false).
-func (s *Tokenizer) KeepSeparator(state bool) {
+func (t *Tokenizer) KeepSeparator(state bool) {
-	s.keepSeparator = state
+	t.keepSeparator = state
 }
 // IgnoreParenthesis defines whether to ignore the parenthesis boundaries
 // and to treat the '(' and ')' as regular characters.
 func (t *Tokenizer) IgnoreParenthesis(state bool) {
 	t.ignoreParenthesis = state
 }
 // Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
 //
 // Returns [io.EOF] error when there are no more tokens to scan.
-func (s *Tokenizer) Scan() (string, error) {
+func (t *Tokenizer) Scan() (string, error) {
-	ch := s.read()
+	ch := t.read()
 	if ch == eof {
 		return "", io.EOF
 	}
 	if isWhitespaceRune(ch) {
-		s.readWhiteSpaces()
+		t.readWhiteSpaces()
 	} else {
-		s.unread()
+		t.unread()
 	}
-	token, err := s.readToken()
+	token, err := t.readToken()
 	if err != nil {
 		return "", err
 	}
 	// read all remaining whitespaces
-	s.readWhiteSpaces()
+	t.readWhiteSpaces()
 	return token, err
 }
 // ScanAll reads the entire Tokenizer's buffer and returns all found tokens.
-func (s *Tokenizer) ScanAll() ([]string, error) {
+func (t *Tokenizer) ScanAll() ([]string, error) {
 	tokens := []string{}
 	for {
-		token, err := s.Scan()
+		token, err := t.Scan()
 		if err != nil {
 			if err == io.EOF {
 				break
@@ -108,35 +116,35 @@ func (s *Tokenizer) ScanAll() ([]string, error) {
 }
 // readToken reads a single token from the buffer and returns it.
-func (s *Tokenizer) readToken() (string, error) {
+func (t *Tokenizer) readToken() (string, error) {
 	var buf bytes.Buffer
 	var parenthesis int
 	var quoteCh rune
 	var prevCh rune
 	for {
-		ch := s.read()
+		ch := t.read()
 		if ch == eof {
 			break
 		}
 		if !isEscapeRune(prevCh) {
-			if ch == '(' && quoteCh == eof {
+			if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
-				parenthesis++
+				parenthesis++ // opening parenthesis
-			} else if ch == ')' && parenthesis > 0 && quoteCh == eof {
+			} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
-				parenthesis--
+				parenthesis-- // closing parenthesis
 			} else if isQuoteRune(ch) {
 				if quoteCh == ch {
-					quoteCh = eof // reached closing quote
+					quoteCh = eof // closing quote
 				} else if quoteCh == eof {
 					quoteCh = ch // opening quote
 				}
 			}
 		}
-		if s.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
+		if t.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
-			if s.keepSeparator {
+			if t.keepSeparator {
 				buf.WriteRune(ch)
 			}
 			break
@@ -154,16 +162,16 @@ func (s *Tokenizer) readToken() (string, error) {
 }
 // readWhiteSpaces consumes all contiguous separator runes (as reported by isSeperatorRune), stopping at EOF or the first other rune.
-func (s *Tokenizer) readWhiteSpaces() {
+func (t *Tokenizer) readWhiteSpaces() {
 	for {
-		ch := s.read()
+		ch := t.read()
 		if ch == eof {
 			break
 		}
-		if !s.isSeperatorRune(ch) {
+		if !t.isSeperatorRune(ch) {
-			s.unread()
+			t.unread()
 			break
 		}
 	}
@@ -171,8 +179,8 @@ func (s *Tokenizer) readWhiteSpaces() {
 // read reads the next rune from the buffered reader.
 // Returns `rune(0)` if an error or `io.EOF` occurs.
-func (s *Tokenizer) read() rune {
+func (t *Tokenizer) read() rune {
-	ch, _, err := s.r.ReadRune()
+	ch, _, err := t.r.ReadRune()
 	if err != nil {
 		return eof
 	}
@@ -181,13 +189,13 @@ func (s *Tokenizer) read() rune {
 }
 // unread places the previously read rune back on the reader.
-func (s *Tokenizer) unread() error {
+func (t *Tokenizer) unread() error {
-	return s.r.UnreadRune()
+	return t.r.UnreadRune()
 }
 // isSeperatorRune checks if a rune is a token part separator.
-func (s *Tokenizer) isSeperatorRune(ch rune) bool {
+func (t *Tokenizer) isSeperatorRune(ch rune) bool {
-	for _, r := range s.separators {
+	for _, r := range t.separators {
 		if ch == r {
 			return true
 		}
--- a/tools/tokenizer/tokenizer_test.go
+++ b/tools/tokenizer/tokenizer_test.go
@@ -28,32 +28,38 @@ func TestFactories(t *testing.T) {
 	}
 	for _, s := range scenarios {
-		content, _ := s.tk.r.ReadString(0)
+		t.Run(s.name, func(t *testing.T) {
 			content, _ := s.tk.r.ReadString(0)
-		if content != expectedContent {
+			if content != expectedContent {
-			t.Fatalf("[%s] Expected reader with content %q, got %q", s.name, expectedContent, content)
+				t.Fatalf("Expected reader with content %q, got %q", expectedContent, content)
-		}
+			}
-		if s.tk.keepSeparator != false {
+			if s.tk.keepSeparator != false {
-			t.Fatalf("[%s] Expected false, got true", s.name)
+				t.Fatal("Expected keepSeparator false, got true")
-		}
+			}
-		if len(s.tk.separators) != len(DefaultSeparators) {
+			if s.tk.ignoreParenthesis != false {
-			t.Fatalf("[%s] Expected \n%v, \ngot \n%v", s.name, DefaultSeparators, s.tk.separators)
+				t.Fatal("Expected ignoreParenthesis false, got true")
-		}
+			}
-		for _, r := range s.tk.separators {
+			if len(s.tk.separators) != len(DefaultSeparators) {
-			exists := false
+				t.Fatalf("Expected \n%v, \ngot \n%v", DefaultSeparators, s.tk.separators)
-			for _, def := range s.tk.separators {
+			}
-				if r == def {
+
-					exists = true
+			for _, r := range s.tk.separators {
-					break
+				exists := false
 				for _, def := range s.tk.separators {
 					if r == def {
 						exists = true
 						break
 					}
 				}
 				if !exists {
 					t.Fatalf("Unexpected sepator %s", string(r))
 				}
 			}
-			if !exists {
+		})
 				t.Fatalf("[%s] Unexpected sepator %s", s.name, string(r))
 			}
 		}
 	}
 }
@@ -85,54 +91,58 @@ func TestScan(t *testing.T) {
 func TestScanAll(t *testing.T) {
 	scenarios := []struct {
-		name          string
+		name              string
-		content       string
+		content           string
-		separators    []rune
+		separators        []rune
-		keepSeparator bool
+		keepSeparator     bool
-		expectError   bool
+		ignoreParenthesis bool
-		expectTokens  []string
+		expectError       bool
 		expectTokens      []string
 	}{
 		{
-			"empty string",
+			name:              "empty string",
-			"",
+			content:           "",
-			DefaultSeparators,
+			separators:        DefaultSeparators,
-			false,
+			keepSeparator:     false,
-			false,
+			ignoreParenthesis: false,
-			nil,
+			expectError:       false,
 			expectTokens:      nil,
 		},
 		{
-			"unbalanced parenthesis",
+			name:              "unbalanced parenthesis",
-			`(a,b() c`,
+			content:           `(a,b() c`,
-			DefaultSeparators,
+			separators:        DefaultSeparators,
-			false,
+			keepSeparator:     false,
-			true,
+			ignoreParenthesis: false,
-			[]string{},
+			expectError:       true,
 			expectTokens:      []string{},
 		},
 		{
-			"unmatching quotes",
+			name:              "unmatching quotes",
-			`'asd"`,
+			content:           `'asd"`,
-			DefaultSeparators,
+			separators:        DefaultSeparators,
-			false,
+			keepSeparator:     false,
-			true,
+			ignoreParenthesis: false,
-			[]string{},
+			expectError:       true,
 			expectTokens:      []string{},
 		},
 		{
-			"no separators",
+			name:              "no separators",
-			`a, b, c, d, e 123, "abc"`,
+			content:           `a, b, c, d, e 123, "abc"`,
-			nil,
+			separators:        nil,
-			false,
+			keepSeparator:     false,
-			false,
+			ignoreParenthesis: false,
-			[]string{
+			expectError:       false,
-				`a, b, c, d, e 123, "abc"`,
+			expectTokens:      []string{`a, b, c, d, e 123, "abc"`},
 			},
 		},
 		{
-			"default separators",
+			name:              "default separators",
-			`a, b, c, d e, "a,b,  c  ", (123, 456)`,
+			content:           `a, b, c, d e, "a,b,  c  ", (123, 456)`,
-			DefaultSeparators,
+			separators:        DefaultSeparators,
-			false,
+			keepSeparator:     false,
-			false,
+			ignoreParenthesis: false,
-			[]string{
+			expectError:       false,
 			expectTokens: []string{
 				"a",
 				"b",
 				"c",
@@ -142,12 +152,13 @@ func TestScanAll(t *testing.T) {
 			},
 		},
 		{
-			"default separators (with preserve)",
+			name:              "default separators (with preserve)",
-			`a, b, c, d e, "a,b,  c  ", (123, 456)`,
+			content:           `a, b, c, d e, "a,b,  c  ", (123, 456)`,
-			DefaultSeparators,
+			separators:        DefaultSeparators,
-			true,
+			keepSeparator:     true,
-			false,
+			ignoreParenthesis: false,
-			[]string{
+			expectError:       false,
 			expectTokens: []string{
 				"a,",
 				"b,",
 				"c,",
@@ -157,14 +168,15 @@ func TestScanAll(t *testing.T) {
 			},
 		},
 		{
-			"custom separators",
+			name: "custom separators",
-			`   a   , 123.456, b, c d, (
+			content: `   a   , 123.456, b, c d, (
 				test (a,b,c) " 123 "
 			),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
-			[]rune{',', ' ', '\t', '\n'},
+			separators:        []rune{',', ' ', '\t', '\n'},
-			false,
+			keepSeparator:     false,
-			false,
+			ignoreParenthesis: false,
-			[]string{
+			expectError:       false,
 			expectTokens: []string{
 				"a",
 				"123.456",
 				"b",
@@ -178,14 +190,15 @@ func TestScanAll(t *testing.T) {
 			},
 		},
 		{
-			"custom separators (with preserve)",
+			name: "custom separators (with preserve)",
-			`   a   , 123.456, b, c d, (
+			content: `   a   , 123.456, b, c d, (
 				test (a,b,c) " 123 "
 			),"(abc d", "abc) d", "(abc) d \" " 'abc "'`,
-			[]rune{',', ' ', '\t', '\n'},
+			separators:        []rune{',', ' ', '\t', '\n'},
-			true,
+			keepSeparator:     true,
-			false,
+			ignoreParenthesis: false,
-			[]string{
+			expectError:       false,
 			expectTokens: []string{
 				"a ",
 				"123.456,",
 				"b,",
@@ -198,36 +211,53 @@ func TestScanAll(t *testing.T) {
 				`'abc "'`,
 			},
 		},
 		{
 			name:              "ignoring parenthesis",
 			content:           `a, b, (c,d)`,
 			separators:        []rune{','},
 			keepSeparator:     false,
 			ignoreParenthesis: true,
 			expectError:       false,
 			expectTokens: []string{
 				"a",
 				"b",
 				"(c",
 				"d)",
 			},
 		},
 	}
 	for _, s := range scenarios {
-		tk := NewFromString(s.content)
+		t.Run(s.name, func(t *testing.T) {
 			tk := NewFromString(s.content)
-		tk.Separators(s.separators...)
+			tk.Separators(s.separators...)
-		tk.KeepSeparator(s.keepSeparator)
+			tk.KeepSeparator(s.keepSeparator)
 			tk.IgnoreParenthesis(s.ignoreParenthesis)
-		tokens, err := tk.ScanAll()
+			tokens, err := tk.ScanAll()
-		hasErr := err != nil
+			hasErr := err != nil
-		if hasErr != s.expectError {
+			if hasErr != s.expectError {
-			t.Fatalf("[%s] Expected hasErr %v, got %v (%v)", s.name, s.expectError, hasErr, err)
+				t.Fatalf("Expected hasErr %v, got %v (%v)", s.expectError, hasErr, err)
-		}
+			}
-		if len(tokens) != len(s.expectTokens) {
+			if len(tokens) != len(s.expectTokens) {
-			t.Fatalf("[%s] Expected \n%v (%d), \ngot \n%v (%d)", s.name, s.expectTokens, len(s.expectTokens), tokens, len(tokens))
+				t.Fatalf("Expected \n%v (%d), \ngot \n%v (%d)", s.expectTokens, len(s.expectTokens), tokens, len(tokens))
-		}
+			}
-		for _, tok := range tokens {
+			for _, tok := range tokens {
-			exists := false
+				exists := false
-			for _, def := range s.expectTokens {
+				for _, def := range s.expectTokens {
-				if tok == def {
+					if tok == def {
-					exists = true
+						exists = true
-					break
+						break
 					}
 				}
 				if !exists {
 					t.Fatalf("Unexpected token %s", tok)
 				}
 			}
-			if !exists {
+		})
 				t.Fatalf("[%s] Unexpected token %s", s.name, tok)
 			}
 		}
 	}
 }