mirror of https://github.com/alecthomas/chroma.git

Initial commit! Working!

lexer.go (new file, 210 lines)

@@ -0,0 +1,210 @@
package chroma

import (
	"fmt"
	"regexp"
	"strings"
)

// Config for a lexer.
type Config struct {
	// Name of the lexer.
	Name string

	// Shortcuts for the lexer.
	Aliases []string

	// File name globs.
	Filenames []string

	// Secondary file name globs.
	AliasFilenames []string

	// MIME types.
	MimeTypes []string

	// Priority, should multiple lexers match and no content be provided.
	Priority int

	// Don't strip leading and trailing newlines from the input.
	DontStripNL bool

	// Strip all leading and trailing whitespace from the input.
	StripAll bool

	// Don't ensure that the input ends with a newline. A trailing newline
	// is required by some lexers that consume input linewise.
	DontEnsureNL bool

	// If given and greater than 0, expand tabs in the input.
	TabSize int

	// If given, must be an encoding name. This encoding will be used to
	// convert the input string to Unicode, if it is not already a Unicode
	// string.
	Encoding string
}
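
// Illustrative sketch, not part of the original commit: a hand-built Config
// using only the fields defined above. The concrete values are hypothetical.
var exampleConfig = &Config{
	Name:      "INI",
	Aliases:   []string{"ini", "cfg"},
	Filenames: []string{"*.ini", "*.cfg"},
	MimeTypes: []string{"text/x-ini"},
	TabSize:   4,
}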

type Token struct {
	Type  TokenType
	Value string
}

func (t Token) String() string   { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }
func (t Token) GoString() string { return t.String() }

type Lexer interface {
	Config() *Config
	Tokenise(text string) ([]Token, error)
}

// Analyser determines if this lexer is appropriate for the given text.
type Analyser interface {
	AnalyseText(text string) float32
}
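
// Illustrative sketch, not part of the original commit: a type implementing
// Analyser to score how likely input is to belong to this lexer. The
// shebang heuristic here is purely hypothetical.
type shellAnalyser struct{}

func (shellAnalyser) AnalyseText(text string) float32 {
	if strings.HasPrefix(text, "#!") {
		return 1.0
	}
	return 0.0
}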

type Rule struct {
	Pattern  string
	Type     Emitter
	Modifier Modifier
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string) []Token
}

type EmitterFunc func(groups []string) []Token

func (e EmitterFunc) Emit(groups []string) []Token { return e(groups) }

// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(types ...TokenType) Emitter {
	return EmitterFunc(func(groups []string) (out []Token) {
		for i, group := range groups[1:] {
			out = append(out, Token{types[i], group})
		}
		return
	})
}
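
// Illustrative sketch, not part of the original commit: an EmitterFunc can be
// written directly when ByGroups does not fit. "Comment" is a hypothetical
// TokenType assumed to be defined elsewhere in the package.
var wholeMatchAsComment = EmitterFunc(func(groups []string) []Token {
	// groups[0] is the whole match; submatches start at groups[1].
	return []Token{{Comment, groups[0]}}
})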

// Words creates a regex that matches any of the given literal words.
func Words(words ...string) string {
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return "\\b(?:" + strings.Join(words, "|") + ")\\b"
}
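
// Illustrative sketch, not part of the original commit: each literal passes
// through regexp.QuoteMeta, so this evaluates to `\b(?:if|else|for)\b` and
// metacharacters in words are escaped rather than interpreted.
var exampleKeywords = Words("if", "else", "for")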

type Rules map[string][]Rule

// MustNewLexer creates a new Lexer or panics.
func MustNewLexer(config *Config, rules Rules) Lexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rules Rules) (Lexer, error) {
	if _, ok := rules["root"]; !ok {
		return nil, fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]CompiledRule{}
	for state, rules := range rules {
		for _, rule := range rules {
			crule := CompiledRule{Rule: rule}
			re, err := regexp.Compile("^(?m)" + rule.Pattern)
			if err != nil {
				return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
			}
			crule.Regexp = re
			compiledRules[state] = append(compiledRules[state], crule)
		}
	}
	// Apply any pre-processor modifiers.
	for state, rules := range compiledRules {
		for index, rule := range rules {
			if rule.Modifier != nil {
				err := rule.Modifier.Preprocess(compiledRules, state, index)
				if err != nil {
					return nil, err
				}
			}
		}
	}
	return &regexLexer{
		config: config,
		rules:  compiledRules,
	}, nil
}
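
// Illustrative sketch, not part of the original commit: a minimal lexer for
// "key = value" lines. Name, Operator, String and Whitespace are hypothetical
// TokenTypes assumed to be defined elsewhere in the package, as Error is.
var exampleLexer = MustNewLexer(&Config{
	Name: "example",
}, Rules{
	"root": {
		{`(\w+)(\s*)(=)(\s*)`, ByGroups(Name, Whitespace, Operator, Whitespace), nil},
		{`([^\n]+)`, ByGroups(String), nil},
		{`(\n)`, ByGroups(Whitespace), nil},
	},
})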

// A CompiledRule is a Rule with a pre-compiled regex.
type CompiledRule struct {
	Rule
	Regexp *regexp.Regexp
}

type regexLexer struct {
	config *Config
	rules  map[string][]CompiledRule
}

func (r *regexLexer) Config() *Config {
	return r.config
}

type LexerState struct {
	Text  string
	Pos   int
	Stack []string
	Rules map[string][]CompiledRule
	State string
}

func (r *regexLexer) Tokenise(text string) (out []Token, err error) {
	state := &LexerState{
		Text:  text,
		Stack: []string{"root"},
		Rules: r.rules,
	}
	for state.Pos < len(text) && len(state.Stack) > 0 {
		state.State = state.Stack[len(state.Stack)-1]
		rule, index := matchRules(state.Text[state.Pos:], state.Rules[state.State])
		// No match: emit a single-character Error token and advance.
		if index == nil {
			out = append(out, Token{Error, state.Text[state.Pos : state.Pos+1]})
			state.Pos++
			continue
		}

		groups := make([]string, len(index)/2)
		for i := 0; i < len(index); i += 2 {
			// Subexpressions that did not participate in the match have
			// index -1; leave their group empty rather than slicing.
			if index[i] == -1 {
				continue
			}
			groups[i/2] = text[state.Pos+index[i] : state.Pos+index[i+1]]
		}
		state.Pos += index[1]
		if rule.Modifier != nil {
			if err = rule.Modifier.Mutate(state); err != nil {
				return
			}
		} else {
			out = append(out, rule.Type.Emit(groups)...)
		}
	}
	return
}
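
// Illustrative sketch, not part of the original commit: driving Tokenise with
// the hypothetical exampleLexer defined above. Input that no rule matches
// surfaces as single-character Error tokens rather than aborting the pass.
func exampleTokenise() {
	tokens, err := exampleLexer.Tokenise("answer = 42\n")
	if err != nil {
		panic(err)
	}
	for _, token := range tokens {
		fmt.Println(token)
	}
}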

func matchRules(text string, rules []CompiledRule) (CompiledRule, []int) {
	for _, rule := range rules {
		if index := rule.Regexp.FindStringSubmatchIndex(text); index != nil {
			return rule, index
		}
	}
	return CompiledRule{}, nil
}