mirror of
https://github.com/alecthomas/chroma.git
synced 2025-02-19 19:00:13 +02:00
Split Regexp lexer into its own file.
This commit is contained in:
parent
a5637e60b2
commit
44b23f97b4
245
lexer.go
245
lexer.go
@ -2,11 +2,6 @@ package chroma
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/dlclark/regexp2"
|
||||
)
|
||||
|
||||
var (
|
||||
@ -117,243 +112,3 @@ func (l Lexers) Less(i, j int) bool { return l[i].Config().Name < l[j].Config().
|
||||
// Analyser is implemented by types that can inspect a piece of text and
// report a float32 score for it.
type Analyser interface {
	// AnalyseText inspects text and returns its score.
	AnalyseText(text string) float32
}
|
||||
|
||||
type Rule struct {
|
||||
Pattern string
|
||||
Type Emitter
|
||||
Mutator Mutator
|
||||
}
|
||||
|
||||
// An Emitter takes group matches and returns tokens.
|
||||
type Emitter interface {
|
||||
// Emit tokens for the given regex groups.
|
||||
Emit(groups []string, lexer Lexer, out func(*Token))
|
||||
}
|
||||
|
||||
// EmitterFunc is a function that is an Emitter.
|
||||
type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))
|
||||
|
||||
// Emit tokens for groups.
|
||||
func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }
|
||||
|
||||
// ByGroups emits a token for each matching group in the rule's regex.
|
||||
func ByGroups(emitters ...Emitter) Emitter {
|
||||
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
|
||||
// NOTE: If this line panics, there is a mismatch with groups. Uncomment the following line to debug.
|
||||
// fmt.Printf("%s %#v\n", emitters, groups[1:])
|
||||
for i, group := range groups[1:] {
|
||||
emitters[i].Emit([]string{group}, lexer, out)
|
||||
}
|
||||
return
|
||||
})
|
||||
}
|
||||
|
||||
// Using returns an Emitter that uses a given Lexer for parsing and emitting.
|
||||
func Using(lexer Lexer, options *TokeniseOptions) Emitter {
|
||||
return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
|
||||
if err := lexer.Tokenise(options, groups[0], out); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// UsingSelf is like Using, but uses the current Lexer.
|
||||
func UsingSelf(state string) Emitter {
|
||||
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
|
||||
if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Words creates a regex that matches any of the given literal words.
//
// Each word is escaped with regexp.QuoteMeta, the escaped words are joined
// with "|" inside a single capture group, and the group is wrapped with prefix
// and suffix, which are inserted verbatim (they are regex source, not
// literals).
//
// The words slice is never modified. Escaping in place would overwrite the
// caller's slice when it is passed as Words(p, s, list...), and a second call
// with the same list would then escape the words twice.
func Words(prefix, suffix string, words ...string) string {
	quoted := make([]string, len(words))
	for i, word := range words {
		quoted[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(quoted, `|`) + `)` + suffix
}
|
||||
|
||||
// Rules maps from state to a sequence of Rules.
|
||||
type Rules map[string][]Rule
|
||||
|
||||
// MustNewLexer creates a new Lexer or panics.
|
||||
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
|
||||
lexer, err := NewLexer(config, rules)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return lexer
|
||||
}
|
||||
|
||||
// NewLexer creates a new regex-based Lexer.
|
||||
//
|
||||
// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
|
||||
// that match input, optionally modify lexer state, and output tokens.
|
||||
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
|
||||
if config == nil {
|
||||
config = &Config{}
|
||||
}
|
||||
if _, ok := rules["root"]; !ok {
|
||||
return nil, fmt.Errorf("no \"root\" state")
|
||||
}
|
||||
compiledRules := map[string][]CompiledRule{}
|
||||
for state, rules := range rules {
|
||||
for _, rule := range rules {
|
||||
flags := ""
|
||||
if !config.NotMultiline {
|
||||
flags += "m"
|
||||
}
|
||||
if config.CaseInsensitive {
|
||||
flags += "i"
|
||||
}
|
||||
if config.DotAll {
|
||||
flags += "s"
|
||||
}
|
||||
compiledRules[state] = append(compiledRules[state], CompiledRule{Rule: rule, flags: flags})
|
||||
}
|
||||
}
|
||||
return &RegexLexer{
|
||||
config: config,
|
||||
rules: compiledRules,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// A CompiledRule is a Rule with a pre-compiled regex.
|
||||
//
|
||||
// Note that regular expressions are lazily compiled on first use of the lexer.
|
||||
type CompiledRule struct {
|
||||
Rule
|
||||
Regexp *regexp2.Regexp
|
||||
flags string
|
||||
}
|
||||
|
||||
// CompiledRules maps a lexer state name to the CompiledRules for that state.
type CompiledRules map[string][]CompiledRule
|
||||
|
||||
type LexerState struct {
|
||||
Text []rune
|
||||
Pos int
|
||||
Rules map[string][]CompiledRule
|
||||
Stack []string
|
||||
State string
|
||||
Rule int
|
||||
// Group matches.
|
||||
Groups []string
|
||||
// Custum context for mutators.
|
||||
MutatorContext map[interface{}]interface{}
|
||||
}
|
||||
|
||||
func (l *LexerState) Set(key interface{}, value interface{}) {
|
||||
l.MutatorContext[key] = value
|
||||
}
|
||||
|
||||
func (l *LexerState) Get(key interface{}) interface{} {
|
||||
return l.MutatorContext[key]
|
||||
}
|
||||
|
||||
type RegexLexer struct {
|
||||
config *Config
|
||||
analyser func(text string) float32
|
||||
|
||||
mu sync.Mutex
|
||||
compiled bool
|
||||
rules map[string][]CompiledRule
|
||||
}
|
||||
|
||||
// SetAnalyser sets the analyser function used to perform content inspection.
|
||||
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
|
||||
r.analyser = analyser
|
||||
return r
|
||||
}
|
||||
|
||||
func (r *RegexLexer) AnalyseText(text string) float32 {
|
||||
if r.analyser != nil {
|
||||
return r.analyser(text)
|
||||
}
|
||||
return 0.0
|
||||
}
|
||||
|
||||
func (r *RegexLexer) Config() *Config {
|
||||
return r.config
|
||||
}
|
||||
|
||||
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
|
||||
func (r *RegexLexer) maybeCompile() (err error) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
if r.compiled {
|
||||
return nil
|
||||
}
|
||||
for state, rules := range r.rules {
|
||||
for i, rule := range rules {
|
||||
if rule.Regexp == nil {
|
||||
rule.Regexp, err = regexp2.Compile("^(?"+rule.flags+")(?:"+rule.Pattern+")", 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
|
||||
}
|
||||
}
|
||||
rules[i] = rule
|
||||
}
|
||||
}
|
||||
r.compiled = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
|
||||
if err := r.maybeCompile(); err != nil {
|
||||
return err
|
||||
}
|
||||
if options == nil {
|
||||
options = defaultOptions
|
||||
}
|
||||
state := &LexerState{
|
||||
Text: []rune(text),
|
||||
Stack: []string{options.State},
|
||||
Rules: r.rules,
|
||||
MutatorContext: map[interface{}]interface{}{},
|
||||
}
|
||||
for state.Pos < len(state.Text) && len(state.Stack) > 0 {
|
||||
state.State = state.Stack[len(state.Stack)-1]
|
||||
ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
|
||||
// No match.
|
||||
if groups == nil {
|
||||
out(&Token{Error, string(state.Text[state.Pos : state.Pos+1])})
|
||||
state.Pos++
|
||||
continue
|
||||
}
|
||||
state.Rule = ruleIndex
|
||||
|
||||
state.Groups = groups
|
||||
state.Pos += len(groups[0])
|
||||
if rule.Mutator != nil {
|
||||
if err := rule.Mutator.Mutate(state); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if rule.Type != nil {
|
||||
rule.Type.Emit(state.Groups, r, out)
|
||||
}
|
||||
}
|
||||
out(&Token{Type: EOF})
|
||||
return nil
|
||||
}
|
||||
|
||||
// Tokenise text using lexer, returning tokens as a slice.
|
||||
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
|
||||
out := []*Token{}
|
||||
return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
|
||||
}
|
||||
|
||||
func matchRules(text []rune, rules []CompiledRule) (int, CompiledRule, []string) {
|
||||
for i, rule := range rules {
|
||||
match, err := rule.Regexp.FindRunesMatch(text)
|
||||
if match != nil && err == nil {
|
||||
groups := []string{}
|
||||
for _, g := range match.Groups() {
|
||||
groups = append(groups, g.String())
|
||||
}
|
||||
return i, rule, groups
|
||||
}
|
||||
}
|
||||
return 0, CompiledRule{}, nil
|
||||
}
|
||||
|
250
regexp.go
Normal file
250
regexp.go
Normal file
@ -0,0 +1,250 @@
|
||||
package chroma
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/dlclark/regexp2"
|
||||
)
|
||||
|
||||
type Rule struct {
|
||||
Pattern string
|
||||
Type Emitter
|
||||
Mutator Mutator
|
||||
}
|
||||
|
||||
// An Emitter takes group matches and returns tokens.
|
||||
type Emitter interface {
|
||||
// Emit tokens for the given regex groups.
|
||||
Emit(groups []string, lexer Lexer, out func(*Token))
|
||||
}
|
||||
|
||||
// EmitterFunc is a function that is an Emitter.
|
||||
type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))
|
||||
|
||||
// Emit tokens for groups.
|
||||
func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }
|
||||
|
||||
// ByGroups emits a token for each matching group in the rule's regex.
|
||||
func ByGroups(emitters ...Emitter) Emitter {
|
||||
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
|
||||
// NOTE: If this line panics, there is a mismatch with groups. Uncomment the following line to debug.
|
||||
// fmt.Printf("%s %#v\n", emitters, groups[1:])
|
||||
for i, group := range groups[1:] {
|
||||
emitters[i].Emit([]string{group}, lexer, out)
|
||||
}
|
||||
return
|
||||
})
|
||||
}
|
||||
|
||||
// Using returns an Emitter that uses a given Lexer for parsing and emitting.
|
||||
func Using(lexer Lexer, options *TokeniseOptions) Emitter {
|
||||
return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
|
||||
if err := lexer.Tokenise(options, groups[0], out); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// UsingSelf is like Using, but uses the current Lexer.
|
||||
func UsingSelf(state string) Emitter {
|
||||
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
|
||||
if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Words creates a regex that matches any of the given literal words.
//
// Each word is escaped with regexp.QuoteMeta, the escaped words are joined
// with "|" inside a single capture group, and the group is wrapped with prefix
// and suffix, which are inserted verbatim (they are regex source, not
// literals).
//
// The words slice is never modified. Escaping in place would overwrite the
// caller's slice when it is passed as Words(p, s, list...), and a second call
// with the same list would then escape the words twice.
func Words(prefix, suffix string, words ...string) string {
	quoted := make([]string, len(words))
	for i, word := range words {
		quoted[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(quoted, `|`) + `)` + suffix
}
|
||||
|
||||
// Tokenise text using lexer, returning tokens as a slice.
|
||||
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
|
||||
out := []*Token{}
|
||||
return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
|
||||
}
|
||||
|
||||
// Rules maps from state to a sequence of Rules.
|
||||
type Rules map[string][]Rule
|
||||
|
||||
// MustNewLexer creates a new Lexer or panics.
|
||||
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
|
||||
lexer, err := NewLexer(config, rules)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return lexer
|
||||
}
|
||||
|
||||
// NewLexer creates a new regex-based Lexer.
|
||||
//
|
||||
// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
|
||||
// that match input, optionally modify lexer state, and output tokens.
|
||||
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
|
||||
if config == nil {
|
||||
config = &Config{}
|
||||
}
|
||||
if _, ok := rules["root"]; !ok {
|
||||
return nil, fmt.Errorf("no \"root\" state")
|
||||
}
|
||||
compiledRules := map[string][]CompiledRule{}
|
||||
for state, rules := range rules {
|
||||
for _, rule := range rules {
|
||||
flags := ""
|
||||
if !config.NotMultiline {
|
||||
flags += "m"
|
||||
}
|
||||
if config.CaseInsensitive {
|
||||
flags += "i"
|
||||
}
|
||||
if config.DotAll {
|
||||
flags += "s"
|
||||
}
|
||||
compiledRules[state] = append(compiledRules[state], CompiledRule{Rule: rule, flags: flags})
|
||||
}
|
||||
}
|
||||
return &RegexLexer{
|
||||
config: config,
|
||||
rules: compiledRules,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// A CompiledRule is a Rule with a pre-compiled regex.
|
||||
//
|
||||
// Note that regular expressions are lazily compiled on first use of the lexer.
|
||||
type CompiledRule struct {
|
||||
Rule
|
||||
Regexp *regexp2.Regexp
|
||||
flags string
|
||||
}
|
||||
|
||||
// CompiledRules maps a lexer state name to the CompiledRules for that state.
type CompiledRules map[string][]CompiledRule
|
||||
|
||||
type LexerState struct {
|
||||
Text []rune
|
||||
Pos int
|
||||
Rules map[string][]CompiledRule
|
||||
Stack []string
|
||||
State string
|
||||
Rule int
|
||||
// Group matches.
|
||||
Groups []string
|
||||
// Custum context for mutators.
|
||||
MutatorContext map[interface{}]interface{}
|
||||
}
|
||||
|
||||
func (l *LexerState) Set(key interface{}, value interface{}) {
|
||||
l.MutatorContext[key] = value
|
||||
}
|
||||
|
||||
func (l *LexerState) Get(key interface{}) interface{} {
|
||||
return l.MutatorContext[key]
|
||||
}
|
||||
|
||||
type RegexLexer struct {
|
||||
config *Config
|
||||
analyser func(text string) float32
|
||||
|
||||
mu sync.Mutex
|
||||
compiled bool
|
||||
rules map[string][]CompiledRule
|
||||
}
|
||||
|
||||
// SetAnalyser sets the analyser function used to perform content inspection.
|
||||
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
|
||||
r.analyser = analyser
|
||||
return r
|
||||
}
|
||||
|
||||
func (r *RegexLexer) AnalyseText(text string) float32 {
|
||||
if r.analyser != nil {
|
||||
return r.analyser(text)
|
||||
}
|
||||
return 0.0
|
||||
}
|
||||
|
||||
func (r *RegexLexer) Config() *Config {
|
||||
return r.config
|
||||
}
|
||||
|
||||
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
|
||||
func (r *RegexLexer) maybeCompile() (err error) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
if r.compiled {
|
||||
return nil
|
||||
}
|
||||
for state, rules := range r.rules {
|
||||
for i, rule := range rules {
|
||||
if rule.Regexp == nil {
|
||||
rule.Regexp, err = regexp2.Compile("^(?"+rule.flags+")(?:"+rule.Pattern+")", 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
|
||||
}
|
||||
}
|
||||
rules[i] = rule
|
||||
}
|
||||
}
|
||||
r.compiled = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
|
||||
if err := r.maybeCompile(); err != nil {
|
||||
return err
|
||||
}
|
||||
if options == nil {
|
||||
options = defaultOptions
|
||||
}
|
||||
state := &LexerState{
|
||||
Text: []rune(text),
|
||||
Stack: []string{options.State},
|
||||
Rules: r.rules,
|
||||
MutatorContext: map[interface{}]interface{}{},
|
||||
}
|
||||
for state.Pos < len(state.Text) && len(state.Stack) > 0 {
|
||||
state.State = state.Stack[len(state.Stack)-1]
|
||||
ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
|
||||
// No match.
|
||||
if groups == nil {
|
||||
out(&Token{Error, string(state.Text[state.Pos : state.Pos+1])})
|
||||
state.Pos++
|
||||
continue
|
||||
}
|
||||
state.Rule = ruleIndex
|
||||
|
||||
state.Groups = groups
|
||||
state.Pos += len(groups[0])
|
||||
if rule.Mutator != nil {
|
||||
if err := rule.Mutator.Mutate(state); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if rule.Type != nil {
|
||||
rule.Type.Emit(state.Groups, r, out)
|
||||
}
|
||||
}
|
||||
out(&Token{Type: EOF})
|
||||
return nil
|
||||
}
|
||||
|
||||
func matchRules(text []rune, rules []CompiledRule) (int, CompiledRule, []string) {
|
||||
for i, rule := range rules {
|
||||
match, err := rule.Regexp.FindRunesMatch(text)
|
||||
if match != nil && err == nil {
|
||||
groups := []string{}
|
||||
for _, g := range match.Groups() {
|
||||
groups = append(groups, g.String())
|
||||
}
|
||||
return i, rule, groups
|
||||
}
|
||||
}
|
||||
return 0, CompiledRule{}, nil
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user