mirror of https://github.com/alecthomas/chroma.git

Initial commit! Working!

lexer.go (new file, 210 lines)

@@ -0,0 +1,210 @@
package chroma

import (
	"fmt"
	"regexp"
	"strings"
)

// Config for a lexer.
type Config struct {
	// Name of the lexer.
	Name string

	// Shortcuts for the lexer.
	Aliases []string

	// File name globs.
	Filenames []string

	// Secondary file name globs.
	AliasFilenames []string

	// MIME types.
	MimeTypes []string

	// Priority, should multiple lexers match and no content be provided.
	Priority int

	// Don't strip leading and trailing newlines from the input.
	DontStripNL bool

	// Strip all leading and trailing whitespace from the input.
	StripAll bool

	// Don't ensure that the input ends with a newline. A trailing newline
	// is required by some lexers that consume input linewise.
	DontEnsureNL bool

	// If given and greater than 0, expand tabs in the input.
	TabSize int

	// If given, must be an encoding name. This encoding will be used to
	// convert the input string to Unicode, if it is not already a Unicode
	// string.
	Encoding string
}
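
// Illustrative sketch, not part of the original commit: a hand-built Config
// using only the fields defined above. The concrete values are hypothetical.
var exampleConfig = &Config{
	Name:      "INI",
	Aliases:   []string{"ini", "cfg"},
	Filenames: []string{"*.ini", "*.cfg"},
	MimeTypes: []string{"text/x-ini"},
	TabSize:   4,
}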

type Token struct {
	Type  TokenType
	Value string
}

func (t Token) String() string   { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }
func (t Token) GoString() string { return t.String() }

type Lexer interface {
	Config() *Config
	Tokenise(text string) ([]Token, error)
}

// Analyser determines if this lexer is appropriate for the given text.
type Analyser interface {
	AnalyseText(text string) float32
}
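
// Illustrative sketch, not part of the original commit: a type implementing
// Analyser to score how likely input is to belong to this lexer. The
// shebang heuristic here is purely hypothetical.
type shellAnalyser struct{}

func (shellAnalyser) AnalyseText(text string) float32 {
	if strings.HasPrefix(text, "#!") {
		return 1.0
	}
	return 0.0
}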

type Rule struct {
	Pattern  string
	Type     Emitter
	Modifier Modifier
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string) []Token
}

type EmitterFunc func(groups []string) []Token

func (e EmitterFunc) Emit(groups []string) []Token { return e(groups) }

// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(types ...TokenType) Emitter {
	return EmitterFunc(func(groups []string) (out []Token) {
		for i, group := range groups[1:] {
			out = append(out, Token{types[i], group})
		}
		return
	})
}
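
// Illustrative sketch, not part of the original commit: an EmitterFunc can be
// written directly when ByGroups does not fit. "Comment" is a hypothetical
// TokenType assumed to be defined elsewhere in the package.
var wholeMatchAsComment = EmitterFunc(func(groups []string) []Token {
	// groups[0] is the whole match; submatches start at groups[1].
	return []Token{{Comment, groups[0]}}
})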

// Words creates a regex that matches any of the given literal words.
func Words(words ...string) string {
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return "\\b(?:" + strings.Join(words, "|") + ")\\b"
}
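
// Illustrative sketch, not part of the original commit: each literal passes
// through regexp.QuoteMeta, so this evaluates to `\b(?:if|else|for)\b` and
// metacharacters in words are escaped rather than interpreted.
var exampleKeywords = Words("if", "else", "for")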

type Rules map[string][]Rule

// MustNewLexer creates a new Lexer or panics.
func MustNewLexer(config *Config, rules Rules) Lexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rules Rules) (Lexer, error) {
	if _, ok := rules["root"]; !ok {
		return nil, fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]CompiledRule{}
	for state, rules := range rules {
		for _, rule := range rules {
			crule := CompiledRule{Rule: rule}
			re, err := regexp.Compile("^(?m)" + rule.Pattern)
			if err != nil {
				return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
			}
			crule.Regexp = re
			compiledRules[state] = append(compiledRules[state], crule)
		}
	}
	// Apply any pre-processor modifiers.
	for state, rules := range compiledRules {
		for index, rule := range rules {
			if rule.Modifier != nil {
				err := rule.Modifier.Preprocess(compiledRules, state, index)
				if err != nil {
					return nil, err
				}
			}
		}
	}
	return &regexLexer{
		config: config,
		rules:  compiledRules,
	}, nil
}
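
// Illustrative sketch, not part of the original commit: a minimal lexer for
// "key = value" lines. Name, Operator, String and Whitespace are hypothetical
// TokenTypes assumed to be defined elsewhere in the package, as Error is.
var exampleLexer = MustNewLexer(&Config{
	Name: "example",
}, Rules{
	"root": {
		{`(\w+)(\s*)(=)(\s*)`, ByGroups(Name, Whitespace, Operator, Whitespace), nil},
		{`([^\n]+)`, ByGroups(String), nil},
		{`(\n)`, ByGroups(Whitespace), nil},
	},
})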

// A CompiledRule is a Rule with a pre-compiled regex.
type CompiledRule struct {
	Rule
	Regexp *regexp.Regexp
}

type regexLexer struct {
	config *Config
	rules  map[string][]CompiledRule
}

func (r *regexLexer) Config() *Config {
	return r.config
}

type LexerState struct {
	Text  string
	Pos   int
	Stack []string
	Rules map[string][]CompiledRule
	State string
}

func (r *regexLexer) Tokenise(text string) (out []Token, err error) {
	state := &LexerState{
		Text:  text,
		Stack: []string{"root"},
		Rules: r.rules,
	}
	for state.Pos < len(text) && len(state.Stack) > 0 {
		state.State = state.Stack[len(state.Stack)-1]
		rule, index := matchRules(state.Text[state.Pos:], state.Rules[state.State])
		// No match: emit a single-character Error token and advance.
		if index == nil {
			out = append(out, Token{Error, state.Text[state.Pos : state.Pos+1]})
			state.Pos++
			continue
		}

		groups := make([]string, len(index)/2)
		for i := 0; i < len(index); i += 2 {
			// Subexpressions that did not participate in the match have
			// index -1; leave their group empty rather than slicing.
			if index[i] == -1 {
				continue
			}
			groups[i/2] = text[state.Pos+index[i] : state.Pos+index[i+1]]
		}
		state.Pos += index[1]
		if rule.Modifier != nil {
			if err = rule.Modifier.Mutate(state); err != nil {
				return
			}
		} else {
			out = append(out, rule.Type.Emit(groups)...)
		}
	}
	return
}
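
// Illustrative sketch, not part of the original commit: driving Tokenise with
// the hypothetical exampleLexer defined above. Input that no rule matches
// surfaces as single-character Error tokens rather than aborting the pass.
func exampleTokenise() {
	tokens, err := exampleLexer.Tokenise("answer = 42\n")
	if err != nil {
		panic(err)
	}
	for _, token := range tokens {
		fmt.Println(token)
	}
}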

func matchRules(text string, rules []CompiledRule) (CompiledRule, []int) {
	for _, rule := range rules {
		if index := rule.Regexp.FindStringSubmatchIndex(text); index != nil {
			return rule, index
		}
	}
	return CompiledRule{}, nil
}