Mirror of https://github.com/alecthomas/chroma.git (synced 2025-03-19 21:10:15 +02:00)
This cleans up the API in general, removing a bunch of deprecated stuff, cleaning up circular imports, etc. The biggest change, however, is switching to an optional XML format for the regex lexers.

Having lexers defined only in Go is not ideal for a couple of reasons. Firstly, it locks out a significant portion of contributors who use Chroma through Hugo but don't know Go. Secondly, it bloats the binary size of any project that imports Chroma.

Why XML? YAML is an abomination and JSON is not human-editable. XML also compresses very well (e.g. the Go template lexer XML compresses from 3239 bytes to 718).

Why a new syntax format? All major existing formats rely on the Oniguruma regex engine, which is extremely complex and has no Go port.

Why not earlier? Prior to the existence of fs.FS this was not a viable option.

Benchmarks:

    $ hyperfine --warmup 3 \
        './chroma.master --version' \
        './chroma.xml-pre-opt --version' \
        './chroma.xml --version'
    Benchmark 1: ./chroma.master --version
      Time (mean ± σ):     5.3 ms ± 0.5 ms    [User: 3.6 ms, System: 1.4 ms]
      Range (min … max):   4.2 ms … 6.6 ms    233 runs

    Benchmark 2: ./chroma.xml-pre-opt --version
      Time (mean ± σ):     50.6 ms ± 0.5 ms   [User: 52.4 ms, System: 3.6 ms]
      Range (min … max):   49.2 ms … 51.5 ms  51 runs

    Benchmark 3: ./chroma.xml --version
      Time (mean ± σ):     6.9 ms ± 1.1 ms    [User: 5.1 ms, System: 1.5 ms]
      Range (min … max):   5.7 ms … 19.9 ms   196 runs

    Summary
      './chroma.master --version' ran
        1.30 ± 0.23 times faster than './chroma.xml --version'
        9.56 ± 0.83 times faster than './chroma.xml-pre-opt --version'

A slight increase in init time, but I think this is okay given the increase in flexibility.

And the binary size difference:

    $ du -sh chroma*
    8.8M    chroma.master
    7.8M    chroma.xml
    7.8M    chroma.xml-pre-opt

Incompatible changes:

- (*RegexLexer).SetAnalyser: changed from func(func(text string) float32) *RegexLexer to func(func(text string) float32) Lexer
- (*TokenType).UnmarshalJSON: removed
- Lexer.AnalyseText: added
- Lexer.SetAnalyser: added
- Lexer.SetRegistry: added
- MustNewLazyLexer: removed
- MustNewLexer: changed from func(*Config, Rules) *RegexLexer to func(*Config, func() Rules) *RegexLexer (see the sketch after this list)
- Mutators: changed from func(...Mutator) MutatorFunc to func(...Mutator) Mutator
- NewLazyLexer: removed
- NewLexer: changed from func(*Config, Rules) (*RegexLexer, error) to func(*Config, func() Rules) (*RegexLexer, error)
- Pop: changed from func(int) MutatorFunc to func(int) Mutator
- Push: changed from func(...string) MutatorFunc to func(...string) Mutator
- TokenType.MarshalJSON: removed
- Using: changed from func(Lexer) Emitter to func(string) Emitter
- UsingByGroup: changed from func(func(string) Lexer, int, int, ...Emitter) Emitter to func(int, int, ...Emitter) Emitter
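To make the constructor change concrete, here is a minimal sketch of the new lazy-rules API. The lexer name, patterns, and token types are invented for the example, and the /v2 import path is an assumption; the func() Rules signature is the part taken from the change list above.

    package mylexer

    import "github.com/alecthomas/chroma/v2"

    // Rules are now supplied as a func() Rules, so they are built on first
    // use rather than eagerly at init time.
    var MyLang = chroma.MustNewLexer(
    	&chroma.Config{
    		Name:      "MyLang", // hypothetical language, for illustration only
    		Aliases:   []string{"mylang"},
    		Filenames: []string{"*.my"},
    	},
    	func() chroma.Rules {
    		return chroma.Rules{
    			"root": {
    				{`//.*`, chroma.CommentSingle, nil},
    				{`"[^"]*"`, chroma.String, nil},
    				{`\s+`, chroma.Whitespace, nil},
    				{`.`, chroma.Text, nil},
    			},
    		}
    	},
    )

In the same spirit, Using now takes the target lexer's name rather than a Lexer value, which, per the new Lexer.SetRegistry method below, lets cross-lexer references be resolved through a registry instead of being captured directly at definition time.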
143 lines · 3.8 KiB · Go
package chroma

import (
	"fmt"
	"strings"
)

var (
	defaultOptions = &TokeniseOptions{
		State:    "root",
		EnsureLF: true,
	}
)

// Config for a lexer.
type Config struct {
	// Name of the lexer.
	Name string `xml:"name,omitempty"`

	// Shortcuts for the lexer
	Aliases []string `xml:"alias,omitempty"`

	// File name globs
	Filenames []string `xml:"filename,omitempty"`

	// Secondary file name globs
	AliasFilenames []string `xml:"alias_filename,omitempty"`

	// MIME types
	MimeTypes []string `xml:"mime_type,omitempty"`

	// Regex matching is case-insensitive.
	CaseInsensitive bool `xml:"case_insensitive,omitempty"`

	// Regex matches all characters.
	DotAll bool `xml:"dot_all,omitempty"`

	// Regex does not match across lines ($ matches EOL).
	//
	// Defaults to multiline.
	NotMultiline bool `xml:"not_multiline,omitempty"`

	// Don't strip leading and trailing newlines from the input.
	// DontStripNL bool

	// Strip all leading and trailing whitespace from the input
	// StripAll bool

	// Make sure that the input ends with a newline. This
	// is required for some lexers that consume input linewise.
	EnsureNL bool `xml:"ensure_nl,omitempty"`

	// If given and greater than 0, expand tabs in the input.
	// TabSize int

	// Priority of lexer.
	//
	// If this is 0 it will be treated as a default of 1.
	Priority float32 `xml:"priority,omitempty"`
}
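
// The xml struct tags above mean a Config can be decoded straight from a
// lexer definition with encoding/xml. A minimal sketch (the element values
// are invented for the example; real lexer files carry more surrounding
// structure):
//
//	var cfg Config
//	err := xml.Unmarshal([]byte(`
//	<config>
//	  <name>MyLang</name>
//	  <alias>mylang</alias>
//	  <alias>my</alias>
//	  <filename>*.my</filename>
//	</config>`), &cfg)
//	// err == nil
//	// cfg.Name == "MyLang"
//	// cfg.Aliases == []string{"mylang", "my"}
//	// cfg.Filenames == []string{"*.my"}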

// Token output to formatter.
type Token struct {
	Type  TokenType `json:"type"`
	Value string    `json:"value"`
}

func (t *Token) String() string   { return t.Value }
func (t *Token) GoString() string { return fmt.Sprintf("&Token{%s, %q}", t.Type, t.Value) }

// Clone returns a clone of the Token.
func (t *Token) Clone() Token {
	return *t
}
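
// String and GoString make tokens friendly to the fmt verbs %s and %#v.
// For example:
//
//	tok := Token{Type: Keyword, Value: "func"}
//	fmt.Printf("%s\n", &tok)  // func
//	fmt.Printf("%#v\n", &tok) // &Token{Keyword, "func"}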

// EOF is returned by lexers at the end of input.
var EOF Token

// TokeniseOptions contains options for tokenisers.
type TokeniseOptions struct {
	// State to start tokenisation in. Defaults to "root".
	State string
	// Nested tokenisation.
	Nested bool

	// If true, all EOLs are converted into LF
	// by replacing CRLF and CR
	EnsureLF bool
}

// A Lexer for tokenising source code.
type Lexer interface {
	// Config describing the features of the Lexer.
	Config() *Config
	// Tokenise returns an Iterator over tokens in text.
	Tokenise(options *TokeniseOptions, text string) (Iterator, error)
	// SetRegistry sets the registry this Lexer is associated with.
	//
	// The registry should be used by the Lexer if it needs to look up other
	// lexers.
	SetRegistry(registry *LexerRegistry) Lexer
	// SetAnalyser sets a function the Lexer should use for scoring how
	// likely a fragment of text is to match this lexer, between 0.0 and 1.0.
	// A value of 1 indicates high confidence.
	//
	// Lexers may ignore this if they implement their own analysers.
	SetAnalyser(analyser func(text string) float32) Lexer
	// AnalyseText scores how likely a fragment of text is to match
	// this lexer, between 0.0 and 1.0. A value of 1 indicates high confidence.
	AnalyseText(text string) float32
}
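
// A typical tokenisation loop over this interface, as a sketch. It assumes
// the v2 import paths (github.com/alecthomas/chroma/v2 and its lexers
// subpackage), with lexers.Get and lexers.Fallback used for lookup:
//
//	lexer := lexers.Get("go") // by name or alias; nil if unknown
//	if lexer == nil {
//		lexer = lexers.Fallback
//	}
//	// For the regex lexers, a nil *TokeniseOptions falls back to
//	// defaultOptions (State: "root", EnsureLF: true).
//	it, err := lexer.Tokenise(nil, "package main")
//	if err != nil {
//		log.Fatal(err)
//	}
//	for tok := it(); tok != EOF; tok = it() {
//		fmt.Printf("%s %q\n", tok.Type, tok.Value)
//	}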

// Lexers is a slice of lexers sortable by name.
type Lexers []Lexer

func (l Lexers) Len() int      { return len(l) }
func (l Lexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l Lexers) Less(i, j int) bool {
	return strings.ToLower(l[i].Config().Name) < strings.ToLower(l[j].Config().Name)
}

// PrioritisedLexers is a slice of lexers sortable by priority.
type PrioritisedLexers []Lexer

func (l PrioritisedLexers) Len() int      { return len(l) }
func (l PrioritisedLexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l PrioritisedLexers) Less(i, j int) bool {
	ip := l[i].Config().Priority
	if ip == 0 {
		ip = 1
	}
	jp := l[j].Config().Priority
	if jp == 0 {
		jp = 1
	}
	return ip > jp
}
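
// Both orderings plug into the standard library's sort package. A sketch,
// with invented lexer variables:
//
//	all := Lexers{goLexer, pythonLexer, cLexer}
//	sort.Sort(all) // case-insensitive alphabetical: C, Go, Python
//
//	byPriority := PrioritisedLexers(all)
//	sort.Sort(byPriority) // descending Priority, zero counting as the default 1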

// Analyser determines how appropriate this lexer is for the given text.
type Analyser interface {
	AnalyseText(text string) float32
}
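
// Every Lexer is also an Analyser via its AnalyseText method, and a custom
// heuristic can be attached with SetAnalyser. A sketch (the shebang check is
// invented for the example):
//
//	lexer = lexer.SetAnalyser(func(text string) float32 {
//		if strings.HasPrefix(text, "#!/bin/bash") {
//			return 1.0 // certain match
//		}
//		return 0.0
//	})
//	// 1.0, assuming the lexer honours the override (lexers that implement
//	// their own analysers may ignore it).
//	score := lexer.AnalyseText("#!/bin/bash\necho hello")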