mirror of https://github.com/alecthomas/chroma.git synced 2025-07-15 01:14:21 +02:00

Add ByGroupNames function, same as ByGroups but use named groups (#519)

For named groups that are given no emitter, an Error token is emitted instead.

This also handles the case where an Emitter for group `0` is or is not provided,
since numbers can also be used as group names.
It may be overkill, though, because why would anyone use ByGroupNames
if they wanted to assign a token to the whole match?
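To make the intended usage concrete, here is a minimal sketch; it mirrors the tests added in this commit, and the helper name exampleByGroupNames is illustrative only, not part of the commit:

package chroma

// exampleByGroupNames is an illustrative helper, not part of this commit.
// Each named group gets the emitter registered under its name; a named
// group with no entry in the map is emitted as an Error token.
func exampleByGroupNames() ([]Token, error) {
	l := Coalesce(MustNewLexer(nil, Rules{
		"root": {
			{
				`(?<key>\w+)(?<operator>=)(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:      String,
					`operator`: Operator,
					`value`:    String,
				}),
				nil,
			},
		},
	}))
	it, err := l.Tokenise(nil, `abc=123`)
	if err != nil {
		return nil, err
	}
	// Expected tokens: {String, `abc`}, {Operator, `=`}, {String, `123`}
	return it.Tokens(), nil
}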
Author: Siavash Askari Nasr
Date: 2021-06-08 16:56:59 +04:30
Committed by: GitHub
Parent: 22cbca546a
Commit: 10329f849e
2 changed files with 113 additions and 0 deletions


@@ -52,6 +52,34 @@ func ByGroups(emitters ...Emitter) Emitter {
	})
}

// ByGroupNames emits a token for each named matching group in the rule's regex.
func ByGroupNames(emitters map[string]Emitter) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		iterators := make([]Iterator, 0, len(state.NamedGroups)-1)
		if len(state.NamedGroups)-1 == 0 {
			if emitter, ok := emitters[`0`]; ok {
				iterators = append(iterators, emitter.Emit(groups, state))
			} else {
				iterators = append(iterators, Error.Emit(groups, state))
			}
		} else {
			ruleRegex := state.Rules[state.State][state.Rule].Regexp
			for i := 1; i < len(state.NamedGroups); i++ {
				groupName := ruleRegex.GroupNameFromNumber(i)
				group := state.NamedGroups[groupName]
				if emitter, ok := emitters[groupName]; ok {
					if emitter != nil {
						iterators = append(iterators, emitter.Emit([]string{group}, state))
					}
				} else {
					iterators = append(iterators, Error.Emit([]string{group}, state))
				}
			}
		}
		return Concaterator(iterators...)
	})
}

// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example on a Markdown text block or SQL
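
One path worth calling out in the implementation above: when the rule's regex yields no named groups beyond group `0` (the whole match), as with the `\w+=\w+` regex in the last test case below, ByGroupNames falls back to the emitter registered under the key `0`, or emits an Error token for the whole match if no such entry exists. A hedged sketch of that fallback, using the same rule shape as the tests; the rule and map are illustrative and not part of the commit:

{
	// No capture groups at all, so only group 0 (the whole match) exists.
	`\w+=\w+`,
	ByGroupNames(map[string]Emitter{
		// Illustrative: with this entry the whole match, e.g. `abc=123`,
		// is emitted as a single String token; without it, the whole
		// match becomes an Error token (the last test case below
		// exercises that branch).
		`0`: String,
	}),
	nil,
},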


@@ -99,3 +99,88 @@ func TestEnsureLFFunc(t *testing.T) {
		assert.Equal(t, out, test.out)
	}
}

func TestByGroupNames(t *testing.T) {
	l := Coalesce(MustNewLexer(nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)(?<operator>=)(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:      String,
					`operator`: Operator,
					`value`:    String,
				}),
				nil,
			},
		},
	}))
	it, err := l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc`}, {Operator, `=`}, {String, `123`}}, it.Tokens())

	l = Coalesce(MustNewLexer(nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)(?<operator>=)(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:   String,
					`value`: String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, it.Tokens())

	l = Coalesce(MustNewLexer(nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)=(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:   String,
					`value`: String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc123`}}, it.Tokens())

	l = Coalesce(MustNewLexer(nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`(?<key>\w+)(?<op>=)(?<value>\w+)`,
				ByGroupNames(map[string]Emitter{
					`key`:      String,
					`operator`: Operator,
					`value`:    String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{String, `abc`}, {Error, `=`}, {String, `123`}}, it.Tokens())

	l = Coalesce(MustNewLexer(nil, Rules{ // nolint: forbidigo
		"root": {
			{
				`\w+=\w+`,
				ByGroupNames(map[string]Emitter{
					`key`:      String,
					`operator`: Operator,
					`value`:    String,
				}),
				nil,
			},
		},
	}))
	it, err = l.Tokenise(nil, `abc=123`)
	assert.NoError(t, err)
	assert.Equal(t, []Token{{Error, `abc=123`}}, it.Tokens())
}