chroma/lexer.go

package chroma

import (
	"fmt"
	"regexp"
	"strings"
)

var (
	defaultOptions = &TokeniseOptions{
		State: "root",
	}
)

// Config for a lexer.
type Config struct {
	// Name of the lexer.
	Name string

	// Shortcuts for the lexer
	Aliases []string

	// File name globs
	Filenames []string

	// Secondary file name globs
	AliasFilenames []string

	// MIME types
	MimeTypes []string

	// Priority, should multiple lexers match and no content is provided
	Priority int

	// Regex matching is case-insensitive.
	CaseInsensitive bool

	// Regex matches all characters.
	DotAll bool

	// Regex does not match across lines ($ matches EOL).
	//
	// Defaults to multiline.
	NotMultiline bool

	// Don't strip leading and trailing newlines from the input.
	// DontStripNL bool

	// Strip all leading and trailing whitespace from the input
	// StripAll bool

	// Make sure that the input does not end with a newline. This
	// is required for some lexers that consume input linewise.
	// DontEnsureNL bool

	// If given and greater than 0, expand tabs in the input.
	// TabSize int
}

// Token output to formatter.
type Token struct {
	Type  TokenType
	Value string
}

func (t *Token) String() string   { return t.Value }
func (t *Token) GoString() string { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }

type TokeniseOptions struct {
	// State to start tokenisation in. Defaults to "root".
	State string
}

type Lexer interface {
	Config() *Config
	Tokenise(options *TokeniseOptions, text string, out func(*Token)) error
}

// Analyser determines if this lexer is appropriate for the given text.
type Analyser interface {
	AnalyseText(text string) float32
}

type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string, lexer Lexer, out func(*Token))
}

// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))

// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }

// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
		for i, group := range groups[1:] {
			emitters[i].Emit([]string{group}, lexer, out)
		}
		return
	})
}

// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer, options *TokeniseOptions) Emitter {
	return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
		if err := lexer.Tokenise(options, groups[0], out); err != nil {
			panic(err)
		}
	})
}

// UsingSelf is like Using, but uses the current Lexer.
func UsingSelf(state string) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
		if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
			panic(err)
		}
	})
}

// Words creates a regex that matches any of the given literal words.
func Words(words ...string) string {
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return `\b(?:` + strings.Join(words, `|`) + `)\b`
}

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// MustNewLexer creates a new Lexer or panics.
func MustNewLexer(config *Config, rules Rules) Lexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rules Rules) (Lexer, error) {
	if config == nil {
		config = &Config{}
	}
	if _, ok := rules["root"]; !ok {
		return nil, fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]CompiledRule{}
	for state, rules := range rules {
		for _, rule := range rules {
			crule := CompiledRule{Rule: rule}
			flags := ""
			if !config.NotMultiline {
				flags += "m"
			}
			if config.CaseInsensitive {
				flags += "i"
			}
			if config.DotAll {
				flags += "s"
			}
			re, err := regexp.Compile("^(?" + flags + ")(?:" + rule.Pattern + ")")
			if err != nil {
				return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
			}
			crule.Regexp = re
			compiledRules[state] = append(compiledRules[state], crule)
		}
	}
	return &regexLexer{
		config: config,
		rules:  compiledRules,
	}, nil
}

// A CompiledRule is a Rule with a pre-compiled regex.
type CompiledRule struct {
	Rule
	Regexp *regexp.Regexp
}

type CompiledRules map[string][]CompiledRule

type LexerState struct {
	Text  string
	Pos   int
	Rules map[string][]CompiledRule
	Stack []string
	State string
	Rule  int
}

type regexLexer struct {
	config *Config
	rules  map[string][]CompiledRule
}

func (r *regexLexer) Config() *Config {
	return r.config
}

func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
	if options == nil {
		options = defaultOptions
	}
	state := &LexerState{
		Text:  text,
		Stack: []string{options.State},
		Rules: r.rules,
	}
	for state.Pos < len(text) && len(state.Stack) > 0 {
		state.State = state.Stack[len(state.Stack)-1]
		ruleIndex, rule, index := matchRules(state.Text[state.Pos:], state.Rules[state.State])
		// fmt.Println(text[state.Pos:state.Pos+1], rule, state.Text[state.Pos:state.Pos+1])
		// No match.
		if index == nil {
			out(&Token{Error, state.Text[state.Pos : state.Pos+1]})
			state.Pos++
			continue
		}
		state.Rule = ruleIndex

		groups := make([]string, len(index)/2)
		for i := 0; i < len(index); i += 2 {
			start := state.Pos + index[i]
			end := state.Pos + index[i+1]
			if start == -1 || end == -1 {
				continue
			}
			groups[i/2] = text[start:end]
		}
		state.Pos += index[1]
		if rule.Type != nil {
			rule.Type.Emit(groups, r, out)
		}
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(state); err != nil {
				return err
			}
		}
	}
	return nil
}

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
	out := []*Token{}
	return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
}

func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []int) {
	for i, rule := range rules {
		if index := rule.Regexp.FindStringSubmatchIndex(text); index != nil {
			return i, rule, index
		}
	}
	return 0, CompiledRule{}, nil
}
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`package chroma`

			`import (`
			`"fmt"`
			`"regexp"`
			`"strings"`
			`)`

Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`var (`
			`defaultOptions = &TokeniseOptions{`
			`State: "root",`
			`}`
			`)`

Initial commit! Working! 2017-06-02 00:17:21 +10:00			`// Config for a lexer.`
			`type Config struct {`
			`// Name of the lexer.`
			`Name string`

			`// Shortcuts for the lexer`
			`Aliases []string`

			`// File name globs`
			`Filenames []string`

			`// Secondary file name globs`
			`AliasFilenames []string`

			`// MIME types`
			`MimeTypes []string`

			`// Priority, should multiple lexers match and no content is provided`
			`Priority int`

Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// Regex matching is case-insensitive.`
			`CaseInsensitive bool`

Add JavaScript. 2017-06-05 09:55:19 +10:00			`// Regex matches all characters.`
			`DotAll bool`

			`// Regex does not match across lines ($ matches EOL).`
			`//`
			`// Defaults to multiline.`
			`NotMultiline bool`

Initial commit! Working! 2017-06-02 00:17:21 +10:00			`// Don't strip leading and trailing newlines from the input.`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// DontStripNL bool`
Initial commit! Working! 2017-06-02 00:17:21 +10:00
			`// Strip all leading and trailing whitespace from the input`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// StripAll bool`
Initial commit! Working! 2017-06-02 00:17:21 +10:00
			`// Make sure that the input does not end with a newline. This`
			`// is required for some lexers that consume input linewise.`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// DontEnsureNL bool`
Initial commit! Working! 2017-06-02 00:17:21 +10:00
			`// If given and greater than 0, expand tabs in the input.`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// TabSize int`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`

Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`// Token output to formatter.`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`type Token struct {`
			`Type TokenType`
			`Value string`
			`}`

Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`func (t *Token) String() string { return t.Value }`
			`func (t *Token) GoString() string { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }`
Initial commit! Working! 2017-06-02 00:17:21 +10:00
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`type TokeniseOptions struct {`
			`// State to start tokenisation in. Defaults to "root".`
			`State string`
			`}`

Initial commit! Working! 2017-06-02 00:17:21 +10:00			`type Lexer interface {`
			`Config() *Config`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`Tokenise(options TokeniseOptions, text string, out func(Token)) error`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`

			`// Analyser determines if this lexer is appropriate for the given text.`
			`type Analyser interface {`
			`AnalyseText(text string) float32`
			`}`

			`type Rule struct {`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`Pattern string`
			`Type Emitter`
			`Mutator Mutator`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`

			`// An Emitter takes group matches and returns tokens.`
			`type Emitter interface {`
			`// Emit tokens for the given regex groups.`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`Emit(groups []string, lexer Lexer, out func(*Token))`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`

Use a callback to emit tokens. This is a) faster and b) supports streaming output. 2017-06-02 15:15:15 +10:00			`// EmitterFunc is a function that is an Emitter.`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))`
Initial commit! Working! 2017-06-02 00:17:21 +10:00
Use a callback to emit tokens. This is a) faster and b) supports streaming output. 2017-06-02 15:15:15 +10:00			`// Emit tokens for groups.`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }`
Initial commit! Working! 2017-06-02 00:17:21 +10:00
			`// ByGroups emits a token for each matching group in the rule's regex.`
Add Markdown processor. A bunch of performance improvements. 2017-06-02 11:42:52 +10:00			`func ByGroups(emitters ...Emitter) Emitter {`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`for i, group := range groups[1:] {`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`emitters[i].Emit([]string{group}, lexer, out)`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`
			`return`
			`})`
			`}`

Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// Using returns an Emitter that uses a given Lexer for parsing and emitting.`
			`func Using(lexer Lexer, options *TokeniseOptions) Emitter {`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`if err := lexer.Tokenise(options, groups[0], out); err != nil {`
			`panic(err)`
			`}`
			`})`
			`}`

			`// UsingSelf is like Using, but uses the current Lexer.`
			`func UsingSelf(state string) Emitter {`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {`
Use a callback to emit tokens. This is a) faster and b) supports streaming output. 2017-06-02 15:15:15 +10:00			`panic(err)`
			`}`
			`})`
			`}`

Initial commit! Working! 2017-06-02 00:17:21 +10:00			`// Words creates a regex that matches any of the given literal words.`
			`func Words(words ...string) string {`
			`for i, word := range words {`
			`words[i] = regexp.QuoteMeta(word)`
			`}`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			return `\b(?:` + strings.Join(words, `\|`) + `)\b`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`

Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// Rules maps from state to a sequence of Rules.`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`type Rules map[string][]Rule`

			`// MustNewLexer creates a new Lexer or panics.`
			`func MustNewLexer(config *Config, rules Rules) Lexer {`
			`lexer, err := NewLexer(config, rules)`
			`if err != nil {`
			`panic(err)`
			`}`
			`return lexer`
			`}`

			`// NewLexer creates a new regex-based Lexer.`
			`//`
			`// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules`
			`// that match input, optionally modify lexer state, and output tokens.`
			`func NewLexer(config *Config, rules Rules) (Lexer, error) {`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`if config == nil {`
			`config = &Config{}`
			`}`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`if _, ok := rules["root"]; !ok {`
			`return nil, fmt.Errorf("no \"root\" state")`
			`}`
			`compiledRules := map[string][]CompiledRule{}`
			`for state, rules := range rules {`
			`for _, rule := range rules {`
			`crule := CompiledRule{Rule: rule}`
Add JavaScript. 2017-06-05 09:55:19 +10:00			`flags := ""`
			`if !config.NotMultiline {`
			`flags += "m"`
			`}`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`if config.CaseInsensitive {`
			`flags += "i"`
			`}`
Add JavaScript. 2017-06-05 09:55:19 +10:00			`if config.DotAll {`
			`flags += "s"`
			`}`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`re, err := regexp.Compile("^(?" + flags + ")(?:" + rule.Pattern + ")")`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`if err != nil {`
			`return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)`
			`}`
			`crule.Regexp = re`
			`compiledRules[state] = append(compiledRules[state], crule)`
			`}`
			`}`
			`return &regexLexer{`
			`config: config,`
			`rules: compiledRules,`
			`}, nil`
			`}`

			`// A CompiledRule is a Rule with a pre-compiled regex.`
			`type CompiledRule struct {`
			`Rule`
			`Regexp *regexp.Regexp`
			`}`

Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`type CompiledRules map[string][]CompiledRule`

			`type LexerState struct {`
			`Text string`
			`Pos int`
			`Rules map[string][]CompiledRule`
			`Stack []string`
			`State string`
			`Rule int`
			`}`

Initial commit! Working! 2017-06-02 00:17:21 +10:00			`type regexLexer struct {`
			`config *Config`
			`rules map[string][]CompiledRule`
			`}`

			`func (r regexLexer) Config() Config {`
			`return r.config`
			`}`

Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`func (r regexLexer) Tokenise(options TokeniseOptions, text string, out func(*Token)) error {`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`if options == nil {`
			`options = defaultOptions`
			`}`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`state := &LexerState{`
			`Text: text,`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`Stack: []string{options.State},`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`Rules: r.rules,`
			`}`
			`for state.Pos < len(text) && len(state.Stack) > 0 {`
			`state.State = state.Stack[len(state.Stack)-1]`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`ruleIndex, rule, index := matchRules(state.Text[state.Pos:], state.Rules[state.State])`
			`// fmt.Println(text[state.Pos:state.Pos+1], rule, state.Text[state.Pos:state.Pos+1])`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`// No match.`
			`if index == nil {`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`out(&Token{Error, state.Text[state.Pos : state.Pos+1]})`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`state.Pos++`
			`continue`
			`}`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`state.Rule = ruleIndex`
Initial commit! Working! 2017-06-02 00:17:21 +10:00
			`groups := make([]string, len(index)/2)`
			`for i := 0; i < len(index); i += 2 {`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`start := state.Pos + index[i]`
			`end := state.Pos + index[i+1]`
			`if start == -1 \|\| end == -1 {`
			`continue`
			`}`
			`groups[i/2] = text[start:end]`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`
			`state.Pos += index[1]`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`if rule.Type != nil {`
			`rule.Type.Emit(groups, r, out)`
			`}`
			`if rule.Mutator != nil {`
			`if err := rule.Mutator.Mutate(state); err != nil {`
Use a callback to emit tokens. This is a) faster and b) supports streaming output. 2017-06-02 15:15:15 +10:00			`return err`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`
			`}`
			`}`
Use a callback to emit tokens. This is a) faster and b) supports streaming output. 2017-06-02 15:15:15 +10:00			`return nil`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`

Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`// Tokenise text using lexer, returning tokens as a slice.`
Use pointers to tokens + support regex flags in importer. 2017-06-05 10:29:50 +10:00			`func Tokenise(lexer Lexer, options TokeniseOptions, text string) ([]Token, error) {`
			`out := []*Token{}`
			`return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`}`

			`func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []int) {`
			`for i, rule := range rules {`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`if index := rule.Regexp.FindStringSubmatchIndex(text); index != nil {`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`return i, rule, index`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`
			`}`
Add a bunch of automatically translated lexers. 2017-06-04 22:18:35 +10:00			`return 0, CompiledRule{}, nil`
Initial commit! Working! 2017-06-02 00:17:21 +10:00			`}`