2017-06-02 00:17:21 +10:00
|
|
|
package chroma
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"regexp"
|
|
|
|
"strings"
|
2017-09-15 22:18:20 +10:00
|
|
|
|
|
|
|
"github.com/dlclark/regexp2"
|
2017-06-02 00:17:21 +10:00
|
|
|
)
|
|
|
|
|
2017-06-04 22:18:35 +10:00
|
|
|
var (
|
|
|
|
defaultOptions = &TokeniseOptions{
|
|
|
|
State: "root",
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
2017-06-02 00:17:21 +10:00
|
|
|
// Config holds the descriptive metadata and the regex compilation settings
// of a lexer.
type Config struct {
	// Name of the lexer.
	Name string

	// Aliases are shortcut names for the lexer.
	Aliases []string

	// Filenames are file name globs.
	Filenames []string

	// AliasFilenames are secondary file name globs.
	AliasFilenames []string

	// MimeTypes are the MIME types of the lexer.
	MimeTypes []string

	// CaseInsensitive makes regex matching case-insensitive: NewLexer adds
	// the "i" flag to every rule pattern.
	CaseInsensitive bool

	// DotAll makes the regex "." match all characters: NewLexer adds the
	// "s" flag to every rule pattern.
	DotAll bool

	// NotMultiline stops NewLexer from adding the "m" (multiline) flag to
	// rule patterns.
	//
	// Defaults to multiline.
	NotMultiline bool

	// The options below are currently commented out.

	// Don't strip leading and trailing newlines from the input.
	// DontStripNL bool

	// Strip all leading and trailing whitespace from the input
	// StripAll bool

	// Make sure that the input does not end with a newline. This
	// is required for some lexers that consume input linewise.
	// DontEnsureNL bool

	// If given and greater than 0, expand tabs in the input.
	// TabSize int
}
|
|
|
|
|
2017-06-05 10:29:50 +10:00
|
|
|
// Token output to formatter.
|
2017-06-02 00:17:21 +10:00
|
|
|
type Token struct {
|
|
|
|
Type TokenType
|
|
|
|
Value string
|
|
|
|
}
|
|
|
|
|
2017-06-05 10:29:50 +10:00
|
|
|
// String returns the Value of the token.
func (t *Token) String() string {
	return t.Value
}
|
|
|
|
// GoString formats the token as Token{<Type>, <quoted Value>}.
func (t *Token) GoString() string {
	return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value)
}
|
2017-06-02 00:17:21 +10:00
|
|
|
|
2017-06-04 22:18:35 +10:00
|
|
|
// TokeniseOptions configures a single tokenisation run.
type TokeniseOptions struct {
	// State is the name of the state tokenisation starts in. When nil
	// options are passed, RegexLexer.Tokenise starts in "root".
	State string
}
|
|
|
|
|
2017-07-19 23:51:16 -07:00
|
|
|
// A Lexer for tokenising source code.
|
2017-06-02 00:17:21 +10:00
|
|
|
type Lexer interface {
|
2017-07-19 23:51:16 -07:00
|
|
|
// Config describing the features of the Lexer.
|
2017-06-02 00:17:21 +10:00
|
|
|
Config() *Config
|
2017-07-19 23:51:16 -07:00
|
|
|
// Tokenise text and call out for each generated token.
|
|
|
|
// nil will be passed to out to signify the end of the stream.
|
2017-06-05 10:29:50 +10:00
|
|
|
Tokenise(options *TokeniseOptions, text string, out func(*Token)) error
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
|
2017-06-07 10:27:10 +10:00
|
|
|
// Lexers is a slice of Lexer values.
type Lexers []Lexer
|
|
|
|
|
|
|
|
// Pick attempts to pick the best Lexer for a piece of source code. May return nil.
|
|
|
|
func (l Lexers) Pick(text string) Lexer {
|
|
|
|
if len(l) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
var picked Lexer
|
|
|
|
highest := float32(-1)
|
|
|
|
for _, lexer := range l {
|
|
|
|
if analyser, ok := lexer.(Analyser); ok {
|
|
|
|
score := analyser.AnalyseText(text)
|
|
|
|
if score > highest {
|
|
|
|
highest = score
|
|
|
|
picked = lexer
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return picked
|
|
|
|
}
|
|
|
|
|
2017-07-19 23:51:16 -07:00
|
|
|
// Analyser is implemented by lexers that can score how appropriate they are
// for a given text. Lexers.Pick prefers the lexer with the highest score.
type Analyser interface {
	AnalyseText(text string) float32
}
|
|
|
|
|
|
|
|
type Rule struct {
|
2017-06-04 22:18:35 +10:00
|
|
|
Pattern string
|
|
|
|
Type Emitter
|
|
|
|
Mutator Mutator
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
// An Emitter takes group matches and returns tokens.
|
|
|
|
type Emitter interface {
|
|
|
|
// Emit tokens for the given regex groups.
|
2017-06-05 10:29:50 +10:00
|
|
|
Emit(groups []string, lexer Lexer, out func(*Token))
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
|
2017-06-02 15:15:15 +10:00
|
|
|
// EmitterFunc is a function that is an Emitter.
|
2017-06-05 10:29:50 +10:00
|
|
|
type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))
|
2017-06-02 00:17:21 +10:00
|
|
|
|
2017-06-02 15:15:15 +10:00
|
|
|
// Emit tokens for groups.
|
2017-06-05 10:29:50 +10:00
|
|
|
func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }
|
2017-06-02 00:17:21 +10:00
|
|
|
|
|
|
|
// ByGroups emits a token for each matching group in the rule's regex.
|
2017-06-02 11:42:52 +10:00
|
|
|
func ByGroups(emitters ...Emitter) Emitter {
|
2017-06-05 10:29:50 +10:00
|
|
|
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
|
2017-06-02 00:17:21 +10:00
|
|
|
for i, group := range groups[1:] {
|
2017-06-04 22:18:35 +10:00
|
|
|
emitters[i].Emit([]string{group}, lexer, out)
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
return
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2017-06-04 22:18:35 +10:00
|
|
|
// Using returns an Emitter that uses a given Lexer for parsing and emitting.
|
|
|
|
func Using(lexer Lexer, options *TokeniseOptions) Emitter {
|
2017-06-05 10:29:50 +10:00
|
|
|
return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
|
2017-06-04 22:18:35 +10:00
|
|
|
if err := lexer.Tokenise(options, groups[0], out); err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// UsingSelf is like Using, but uses the current Lexer.
|
|
|
|
func UsingSelf(state string) Emitter {
|
2017-06-05 10:29:50 +10:00
|
|
|
return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
|
2017-06-04 22:18:35 +10:00
|
|
|
if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
|
2017-06-02 15:15:15 +10:00
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2017-06-02 00:17:21 +10:00
|
|
|
// Words creates a regex that matches any of the given literal words.
//
// Every word is escaped with regexp.QuoteMeta and the alternatives are
// wrapped as `\b(?:w1|w2|...)\b`. The escaped words go into a fresh slice:
// when called as Words(list...) the variadic parameter aliases the caller's
// slice, and escaping in place would corrupt it (a second call would then
// escape the words twice).
func Words(words ...string) string {
	quoted := make([]string, len(words))
	for i, word := range words {
		quoted[i] = regexp.QuoteMeta(word)
	}
	return `\b(?:` + strings.Join(quoted, `|`) + `)\b`
}
|
|
|
|
|
2017-06-04 22:18:35 +10:00
|
|
|
// Rules maps from state to a sequence of Rules.
|
2017-06-02 00:17:21 +10:00
|
|
|
type Rules map[string][]Rule
|
|
|
|
|
|
|
|
// MustNewLexer creates a new Lexer or panics.
|
2017-06-07 10:27:10 +10:00
|
|
|
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
|
2017-06-02 00:17:21 +10:00
|
|
|
lexer, err := NewLexer(config, rules)
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return lexer
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewLexer creates a new regex-based Lexer.
|
|
|
|
//
|
|
|
|
// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
|
|
|
|
// that match input, optionally modify lexer state, and output tokens.
|
2017-06-07 10:27:10 +10:00
|
|
|
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
|
2017-06-05 10:29:50 +10:00
|
|
|
if config == nil {
|
|
|
|
config = &Config{}
|
|
|
|
}
|
2017-06-02 00:17:21 +10:00
|
|
|
if _, ok := rules["root"]; !ok {
|
|
|
|
return nil, fmt.Errorf("no \"root\" state")
|
|
|
|
}
|
|
|
|
compiledRules := map[string][]CompiledRule{}
|
|
|
|
for state, rules := range rules {
|
|
|
|
for _, rule := range rules {
|
|
|
|
crule := CompiledRule{Rule: rule}
|
2017-06-05 09:55:19 +10:00
|
|
|
flags := ""
|
|
|
|
if !config.NotMultiline {
|
|
|
|
flags += "m"
|
|
|
|
}
|
2017-06-04 22:18:35 +10:00
|
|
|
if config.CaseInsensitive {
|
|
|
|
flags += "i"
|
|
|
|
}
|
2017-06-05 09:55:19 +10:00
|
|
|
if config.DotAll {
|
|
|
|
flags += "s"
|
|
|
|
}
|
2017-09-15 22:18:20 +10:00
|
|
|
re, err := regexp2.Compile("^(?"+flags+")(?:"+rule.Pattern+")", 0)
|
2017-06-02 00:17:21 +10:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
|
|
|
|
}
|
|
|
|
crule.Regexp = re
|
|
|
|
compiledRules[state] = append(compiledRules[state], crule)
|
|
|
|
}
|
|
|
|
}
|
2017-06-07 10:27:10 +10:00
|
|
|
return &RegexLexer{
|
2017-06-02 00:17:21 +10:00
|
|
|
config: config,
|
|
|
|
rules: compiledRules,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// A CompiledRule is a Rule with a pre-compiled regex.
|
|
|
|
type CompiledRule struct {
|
|
|
|
Rule
|
2017-09-15 22:18:20 +10:00
|
|
|
Regexp *regexp2.Regexp
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
|
2017-06-04 22:18:35 +10:00
|
|
|
// CompiledRules maps a state name to the compiled rules of that state.
type CompiledRules map[string][]CompiledRule
|
|
|
|
|
|
|
|
type LexerState struct {
|
|
|
|
Text string
|
|
|
|
Pos int
|
|
|
|
Rules map[string][]CompiledRule
|
|
|
|
Stack []string
|
|
|
|
State string
|
|
|
|
Rule int
|
2017-06-06 15:59:48 +10:00
|
|
|
// Group matches.
|
|
|
|
Groups []string
|
2017-06-04 22:18:35 +10:00
|
|
|
}
|
|
|
|
|
2017-06-07 10:27:10 +10:00
|
|
|
type RegexLexer struct {
|
|
|
|
config *Config
|
|
|
|
rules map[string][]CompiledRule
|
|
|
|
analyser func(text string) float32
|
|
|
|
}
|
|
|
|
|
|
|
|
// SetAnalyser sets the analyser function used to perform content inspection.
|
|
|
|
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
|
|
|
|
r.analyser = analyser
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *RegexLexer) AnalyseText(text string) float32 {
|
|
|
|
if r.analyser != nil {
|
|
|
|
return r.analyser(text)
|
|
|
|
}
|
|
|
|
return 0.0
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
|
2017-06-07 10:27:10 +10:00
|
|
|
func (r *RegexLexer) Config() *Config {
|
2017-06-02 00:17:21 +10:00
|
|
|
return r.config
|
|
|
|
}
|
|
|
|
|
2017-06-07 10:27:10 +10:00
|
|
|
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
|
2017-06-04 22:18:35 +10:00
|
|
|
if options == nil {
|
|
|
|
options = defaultOptions
|
|
|
|
}
|
2017-06-02 00:17:21 +10:00
|
|
|
state := &LexerState{
|
|
|
|
Text: text,
|
2017-06-04 22:18:35 +10:00
|
|
|
Stack: []string{options.State},
|
2017-06-02 00:17:21 +10:00
|
|
|
Rules: r.rules,
|
|
|
|
}
|
|
|
|
for state.Pos < len(text) && len(state.Stack) > 0 {
|
|
|
|
state.State = state.Stack[len(state.Stack)-1]
|
2017-09-15 22:18:20 +10:00
|
|
|
ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
|
2017-06-02 00:17:21 +10:00
|
|
|
// No match.
|
2017-09-15 22:18:20 +10:00
|
|
|
if groups == nil {
|
2017-06-05 10:29:50 +10:00
|
|
|
out(&Token{Error, state.Text[state.Pos : state.Pos+1]})
|
2017-06-02 00:17:21 +10:00
|
|
|
state.Pos++
|
|
|
|
continue
|
|
|
|
}
|
2017-06-04 22:18:35 +10:00
|
|
|
state.Rule = ruleIndex
|
2017-06-02 00:17:21 +10:00
|
|
|
|
2017-09-15 22:18:20 +10:00
|
|
|
state.Groups = groups
|
|
|
|
state.Pos += len(groups[0])
|
2017-06-04 22:18:35 +10:00
|
|
|
if rule.Mutator != nil {
|
|
|
|
if err := rule.Mutator.Mutate(state); err != nil {
|
2017-06-02 15:15:15 +10:00
|
|
|
return err
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
}
|
2017-06-06 15:59:48 +10:00
|
|
|
if rule.Type != nil {
|
|
|
|
rule.Type.Emit(state.Groups, r, out)
|
|
|
|
}
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
2017-07-19 23:51:16 -07:00
|
|
|
out(&Token{Type: EOF})
|
2017-06-02 15:15:15 +10:00
|
|
|
return nil
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
|
2017-06-04 22:18:35 +10:00
|
|
|
// Tokenise text using lexer, returning tokens as a slice.
|
2017-06-05 10:29:50 +10:00
|
|
|
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
|
|
|
|
out := []*Token{}
|
|
|
|
return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
|
2017-06-04 22:18:35 +10:00
|
|
|
}
|
|
|
|
|
2017-09-15 22:18:20 +10:00
|
|
|
func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []string) {
|
2017-06-04 22:18:35 +10:00
|
|
|
for i, rule := range rules {
|
2017-09-15 22:18:20 +10:00
|
|
|
match, err := rule.Regexp.FindStringMatch(text)
|
|
|
|
if match != nil && err == nil {
|
|
|
|
groups := []string{}
|
|
|
|
for _, g := range match.Groups() {
|
|
|
|
groups = append(groups, g.String())
|
|
|
|
}
|
|
|
|
return i, rule, groups
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|
|
|
|
}
|
2017-06-04 22:18:35 +10:00
|
|
|
return 0, CompiledRule{}, nil
|
2017-06-02 00:17:21 +10:00
|
|
|
}
|