From 5dedc6e45bcfd6467e4593bcdc0ee5cf74e7e0b9 Mon Sep 17 00:00:00 2001
From: Alec Thomas
Date: Sun, 4 Jun 2017 22:18:35 +1000
Subject: [PATCH] Add a bunch of automatically translated lexers.

---
 cmd/chroma/main.go                 |  25 +++--
 coalesce.go                        |   4 +-
 coalesce_test.go                   |   5 +-
 formatters/console.go              |  12 ++-
 lexer.go                           | 144 +++++++++++++++++------
 lexer_test.go                      |  26 +++---
 lexers/bash.go                     |  85 +++++++++++++++++
 lexers/c.go                        |  90 ++++++++++++++++++
 lexers/{default.go => fallback.go} |   6 +-
 lexers/makefile.go                 |  51 ++++++++++
 lexers/markdown.go                 |  14 +--
 lexers/postgres.go                 |  57 ++++++++++++
 lexers/python.go                   | 136 +++++++++++++++++++++++
 lexers/registry.go                 |   2 +-
 modifiers.go                       |  79 ----------------
 modifiers_test.go                  |   6 --
 mutators.go                        |  90 ++++++++++++++++++
 mutators_test.go                   |  55 +++++++++++
 tokentype_string.go                |  64 ++++++-------
 types.go                           |  12 ++-
 20 files changed, 749 insertions(+), 214 deletions(-)
 create mode 100644 lexers/bash.go
 create mode 100644 lexers/c.go
 rename lexers/{default.go => fallback.go} (64%)
 create mode 100644 lexers/makefile.go
 create mode 100644 lexers/postgres.go
 create mode 100644 lexers/python.go
 delete mode 100644 modifiers.go
 delete mode 100644 modifiers_test.go
 create mode 100644 mutators.go
 create mode 100644 mutators_test.go

diff --git a/cmd/chroma/main.go b/cmd/chroma/main.go
index b96bb30..98a6190 100644
--- a/cmd/chroma/main.go
+++ b/cmd/chroma/main.go
@@ -16,9 +16,10 @@ import (
 )

 var (
-	profileFlag = kingpin.Flag("profile", "Enable profiling to file.").String()
+	profileFlag = kingpin.Flag("profile", "Enable profiling to file.").PlaceHolder("FILE").String()
 	tokensFlag  = kingpin.Flag("tokens", "Dump raw tokens.").Bool()
-	filesArgs   = kingpin.Arg("files", "Files to highlight.").Required().ExistingFiles()
+	lexerFlag   = kingpin.Flag("lexer", "Lexer to use when formatting (default is to autodetect).").Short('l').String()
+	filesArgs   = kingpin.Arg("files", "Files to highlight.").ExistingFiles()
 )

 func main() {
@@ -32,14 +33,22 @@ func main() {
 	w := bufio.NewWriterSize(os.Stdout, 16384)
 	defer w.Flush()
 	writer := getWriter(w)
-	for _, filename := range *filesArgs {
-		lexers := lexers.Registry.Match(filename)
-		lexer := lexers[0]
-		lexer = chroma.Coalesce(lexer)
-		contents, err := ioutil.ReadFile(filename)
+	if len(*filesArgs) == 0 {
+		lexer := lexers.Registry.Get(*lexerFlag)
+		contents, err := ioutil.ReadAll(os.Stdin)
 		kingpin.FatalIfError(err, "")
-		err = lexer.Tokenise(string(contents), writer)
+		err = lexer.Tokenise(nil, string(contents), writer)
 		kingpin.FatalIfError(err, "")
+	} else {
+		for _, filename := range *filesArgs {
+			lexers := lexers.Registry.Match(filename)
+			lexer := lexers[0]
+			lexer = chroma.Coalesce(lexer)
+			contents, err := ioutil.ReadFile(filename)
+			kingpin.FatalIfError(err, "")
+			err = lexer.Tokenise(nil, string(contents), writer)
+			kingpin.FatalIfError(err, "")
+		}
 	}
 }
diff --git a/coalesce.go b/coalesce.go
index f54e4b0..e447a61 100644
--- a/coalesce.go
+++ b/coalesce.go
@@ -9,14 +9,14 @@ type coalescer struct {
 	Lexer
 }

-func (d *coalescer) Tokenise(text string, out func(Token)) error {
+func (d *coalescer) Tokenise(options *TokeniseOptions, text string, out func(Token)) error {
 	var last *Token
 	defer func() {
 		if last != nil {
 			out(*last)
 		}
 	}()
-	return d.Lexer.Tokenise(text, func(token Token) {
+	return d.Lexer.Tokenise(options, text, func(token Token) {
 		if last == nil {
 			last = &token
 		} else {
diff --git a/coalesce_test.go b/coalesce_test.go
index da20e20..62831df 100644
--- a/coalesce_test.go
+++ b/coalesce_test.go
@@ -7,13 +7,12 @@ import (
 )

 func TestCoalesce(t *testing.T) {
-	lexer, err := Coalesce(MustNewLexer(nil, Rules{
+	lexer := Coalesce(MustNewLexer(nil, Rules{
 		"root": []Rule{
 			Rule{`[[:punct:]]`, Punctuation, nil},
 		},
 	}))
-	require.NoError(t, err)
-	actual, err := lexer.Tokenise("!@#$%")
+	actual, err := Tokenise(lexer, nil, "!@#$%")
 	require.NoError(t, err)
 	expected := []Token{
 		Token{Punctuation, "!@#$%"},
diff --git a/formatters/console.go b/formatters/console.go
index 8edbf5a..ffd61ed 100644
--- a/formatters/console.go
+++ b/formatters/console.go
@@ -10,16 +10,23 @@ import (
 var DefaultConsoleTheme = map[TokenType]string{
 	Number:            "\033[1m\033[33m",
 	Comment:           "\033[36m",
+	CommentPreproc:    "\033[1m\033[32m",
 	String:            "\033[1m\033[36m",
 	Keyword:           "\033[1m\033[37m",
 	GenericHeading:    "\033[1m",
 	GenericSubheading: "\033[1m",
+	GenericStrong:     "\033[1m",
+	GenericUnderline:  "\033[4m",
+	GenericDeleted:    "\033[9m",
 }

 // Console formatter.
 //
-//     formatter := Console(DefaultConsoleTheme)
+//     formatter := Console(nil)
 func Console(theme map[TokenType]string) Formatter {
+	if theme == nil {
+		theme = DefaultConsoleTheme
+	}
 	return &consoleFormatter{theme}
 }
@@ -35,11 +42,12 @@ func (c *consoleFormatter) Format(w io.Writer) (func(Token), error) {
 			if !ok {
 				clr, ok = c.theme[token.Type.Category()]
 				if !ok {
-					clr = "\033[0m"
+					clr = ""
 				}
 			}
 		}
 		fmt.Fprint(w, clr)
 		fmt.Fprint(w, token.Value)
+		fmt.Fprintf(w, "\033[0m")
 	}, nil
 }
diff --git a/lexer.go b/lexer.go
index afce794..937fa9f 100644
--- a/lexer.go
+++ b/lexer.go
@@ -6,6 +6,12 @@ import (
 	"strings"
 )

+var (
+	defaultOptions = &TokeniseOptions{
+		State: "root",
+	}
+)
+
 // Config for a lexer.
 type Config struct {
 	// Name of the lexer.
@@ -26,23 +32,21 @@ type Config struct {
 	// Priority, should multiple lexers match and no content is provided
 	Priority int

+	// Regex matching is case-insensitive.
+	CaseInsensitive bool
+
 	// Don't strip leading and trailing newlines from the input.
-	DontStripNL bool
+	// DontStripNL bool

 	// Strip all leading and trailing whitespace from the input
-	StripAll bool
+	// StripAll bool

 	// Make sure that the input does not end with a newline. This
 	// is required for some lexers that consume input linewise.
-	DontEnsureNL bool
+	// DontEnsureNL bool

 	// If given and greater than 0, expand tabs in the input.
-	TabSize int
-
-	// If given, must be an encoding name. This encoding will be used to
-	// convert the input string to Unicode, if it is not already a Unicode
-	// string.
-	Encoding string
+	// TabSize int
 }

 type Token struct {
@@ -53,9 +57,14 @@ type Token struct {
 func (t Token) String() string   { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }
 func (t Token) GoString() string { return t.String() }

+type TokeniseOptions struct {
+	// State to start tokenisation in. Defaults to "root".
+	State string
+}
+
 type Lexer interface {
 	Config() *Config
-	Tokenise(text string, out func(Token)) error
+	Tokenise(options *TokeniseOptions, text string, out func(Token)) error
 }

 // Analyser determines if this lexer is appropriate for the given text.
@@ -64,39 +73,46 @@ type Analyser interface {
 }

 type Rule struct {
-	Pattern  string
-	Type     Emitter
-	Modifier Modifier
+	Pattern string
+	Type    Emitter
+	Mutator Mutator
 }

 // An Emitter takes group matches and returns tokens.
 type Emitter interface {
 	// Emit tokens for the given regex groups.
-	Emit(groups []string, out func(Token))
+	Emit(groups []string, lexer Lexer, out func(Token))
 }

 // EmitterFunc is a function that is an Emitter.
-type EmitterFunc func(groups []string, out func(Token))
+type EmitterFunc func(groups []string, lexer Lexer, out func(Token))

 // Emit tokens for groups.
-func (e EmitterFunc) Emit(groups []string, out func(Token)) { e(groups, out) }
+func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(Token)) { e(groups, lexer, out) }

 // ByGroups emits a token for each matching group in the rule's regex.
 func ByGroups(emitters ...Emitter) Emitter {
-	return EmitterFunc(func(groups []string, out func(Token)) {
+	return EmitterFunc(func(groups []string, lexer Lexer, out func(Token)) {
 		for i, group := range groups[1:] {
-			emitters[i].Emit([]string{group}, out)
+			emitters[i].Emit([]string{group}, lexer, out)
 		}
 		return
 	})
 }

-// Using uses a given Lexer for parsing and emitting.
-func Using(lexer Lexer) Emitter {
-	return EmitterFunc(func(groups []string, out func(Token)) {
-		if err := lexer.Tokenise(groups[0], out); err != nil {
-			// TODO: Emitters should return an error, though it's not clear what one would do with
-			// it.
+// Using returns an Emitter that uses a given Lexer for parsing and emitting.
+func Using(lexer Lexer, options *TokeniseOptions) Emitter {
+	return EmitterFunc(func(groups []string, _ Lexer, out func(Token)) {
+		if err := lexer.Tokenise(options, groups[0], out); err != nil {
+			panic(err)
+		}
+	})
+}
+
+// UsingSelf is like Using, but uses the current Lexer.
+func UsingSelf(state string) Emitter {
+	return EmitterFunc(func(groups []string, lexer Lexer, out func(Token)) {
+		if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
 			panic(err)
 		}
 	})
@@ -107,9 +123,10 @@ func Words(words ...string) string {
 	for i, word := range words {
 		words[i] = regexp.QuoteMeta(word)
 	}
-	return "\\b(?:" + strings.Join(words, "|") + ")\\b"
+	return `\b(?:` + strings.Join(words, `|`) + `)\b`
 }

+// Rules maps from state to a sequence of Rules.
 type Rules map[string][]Rule

 // MustNewLexer creates a new Lexer or panics.
@@ -133,7 +150,11 @@ func NewLexer(config *Config, rules Rules) (Lexer, error) {
 	for state, rules := range rules {
 		for _, rule := range rules {
 			crule := CompiledRule{Rule: rule}
-			re, err := regexp.Compile("^(?m)" + rule.Pattern)
+			flags := "m"
+			if config.CaseInsensitive {
+				flags += "i"
+			}
+			re, err := regexp.Compile("^(?" + flags + ")(?:" + rule.Pattern + ")")
 			if err != nil {
 				return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
 			}
@@ -141,17 +162,6 @@ func NewLexer(config *Config, rules Rules) (Lexer, error) {
 			compiledRules[state] = append(compiledRules[state], crule)
 		}
 	}
-	// Apply any pre-processor modifiers.
-	for state, rules := range compiledRules {
-		for index, rule := range rules {
-			if rule.Modifier != nil {
-				err := rule.Modifier.Preprocess(compiledRules, state, index)
-				if err != nil {
-					return nil, err
-				}
-			}
-		}
-	}
 	return &regexLexer{
 		config: config,
 		rules:  compiledRules,
@@ -164,6 +174,17 @@ type CompiledRule struct {
 	Regexp *regexp.Regexp
 }

+type CompiledRules map[string][]CompiledRule
+
+type LexerState struct {
+	Text  string
+	Pos   int
+	Rules map[string][]CompiledRule
+	Stack []string
+	State string
+	Rule  int
+}
+
 type regexLexer struct {
 	config *Config
 	rules  map[string][]CompiledRule
@@ -173,51 +194,60 @@ func (r *regexLexer) Config() *Config {
 	return r.config
 }

-type LexerState struct {
-	Text  string
-	Pos   int
-	Stack []string
-	Rules map[string][]CompiledRule
-	State string
-}
-
-func (r *regexLexer) Tokenise(text string, out func(Token)) error {
+func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(Token)) error {
+	if options == nil {
+		options = defaultOptions
+	}
 	state := &LexerState{
 		Text:  text,
-		Stack: []string{"root"},
+		Stack: []string{options.State},
 		Rules: r.rules,
 	}
 	for state.Pos < len(text) && len(state.Stack) > 0 {
 		state.State = state.Stack[len(state.Stack)-1]
-		rule, index := matchRules(state.Text[state.Pos:], state.Rules[state.State])
+		ruleIndex, rule, index := matchRules(state.Text[state.Pos:], state.Rules[state.State])
+		// fmt.Println(text[state.Pos:state.Pos+1], rule, state.Text[state.Pos:state.Pos+1])
 		// No match.
 		if index == nil {
 			out(Token{Error, state.Text[state.Pos : state.Pos+1]})
 			state.Pos++
 			continue
 		}
+		state.Rule = ruleIndex
 		groups := make([]string, len(index)/2)
 		for i := 0; i < len(index); i += 2 {
-			groups[i/2] = text[state.Pos+index[i] : state.Pos+index[i+1]]
+			start := state.Pos + index[i]
+			end := state.Pos + index[i+1]
+			if start == -1 || end == -1 {
+				continue
+			}
+			groups[i/2] = text[start:end]
 		}
 		state.Pos += index[1]
-		if rule.Modifier != nil {
-			if err := rule.Modifier.Mutate(state); err != nil {
+		if rule.Type != nil {
+			rule.Type.Emit(groups, r, out)
+		}
+		if rule.Mutator != nil {
+			if err := rule.Mutator.Mutate(state); err != nil {
 				return err
 			}
-		} else {
-			rule.Type.Emit(groups, out)
 		}
 	}
 	return nil
 }

-func matchRules(text string, rules []CompiledRule) (CompiledRule, []int) {
-	for _, rule := range rules {
+// Tokenise text using lexer, returning tokens as a slice.
+func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
+	out := []Token{}
+	return out, lexer.Tokenise(options, text, func(token Token) { out = append(out, token) })
+}
+
+func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []int) {
+	for i, rule := range rules {
 		if index := rule.Regexp.FindStringSubmatchIndex(text); index != nil {
-			return rule, index
+			return i, rule, index
 		}
 	}
-	return CompiledRule{}, nil
+	return 0, CompiledRule{}, nil
 }
diff --git a/lexer_test.go b/lexer_test.go
index 99680bf..afba59d 100644
--- a/lexer_test.go
+++ b/lexer_test.go
@@ -20,7 +20,7 @@ func TestSimpleLexer(t *testing.T) {
 			Filenames: []string{"*.ini", "*.cfg"},
 		},
 		map[string][]Rule{
-			"root": []Rule{
+			"root": {
 				{`\s+`, Whitespace, nil},
 				{`;.*?$`, Comment, nil},
 				{`\[.*?\]$`, Keyword, nil},
@@ -29,24 +29,24 @@
 	)
 	require.NoError(t, err)
-	actual, err := lexer.Tokenise(`
+	actual, err := Tokenise(lexer, nil, `
 	; this is a comment
 	[section]
 	a = 10
 `)
 	require.NoError(t, err)
 	expected := []Token{
-		Token{Whitespace, "\n\t"},
-		Token{Comment, "; this is a comment"},
-		Token{Whitespace, "\n\t"},
-		Token{Keyword, "[section]"},
-		Token{Whitespace, "\n\t"},
-		Token{Name, "a"},
-		Token{Whitespace, " "},
-		Token{Operator, "="},
-		Token{Whitespace, " "},
-		Token{LiteralString, "10"},
-		Token{Whitespace, "\n"},
+		{Whitespace, "\n\t"},
+		{Comment, "; this is a comment"},
+		{Whitespace, "\n\t"},
+		{Keyword, "[section]"},
+		{Whitespace, "\n\t"},
+		{Name, "a"},
+		{Whitespace, " "},
+		{Operator, "="},
+		{Whitespace, " "},
+		{LiteralString, "10"},
+		{Whitespace, "\n"},
 	}
 	require.Equal(t, expected, actual)
 }
diff --git a/lexers/bash.go b/lexers/bash.go
new file mode 100644
index 0000000..c9e8b3a
--- /dev/null
+++ b/lexers/bash.go
@@ -0,0 +1,85 @@
+package lexers
+
+import (
+	. "github.com/alecthomas/chroma" // nolint
+)
+
+// Bash lexer.
+var Bash = Register(NewLexer(
+	&Config{
+		Name:      "Bash",
+		Aliases:   []string{"bash", "sh", "ksh", "zsh", "shell"},
+		Filenames: []string{"*.sh", "*.ksh", "*.bash", "*.ebuild", "*.eclass", "*.exheres-0", "*.exlib", "*.zsh", ".bashrc", "bashrc", ".bash_*", "bash_*", "zshrc", ".zshrc", "PKGBUILD"},
+		MimeTypes: []string{"application/x-sh", "application/x-shellscript"},
+	},
+	Rules{
+		"root": {
+			Include("basic"),
+			{"`", LiteralStringBacktick, Push("backticks")},
+			Include("data"),
+			Include("interp"),
+		},
+		"interp": {
+			{`\$\(\(`, Keyword, Push("math")},
+			{`\$\(`, Keyword, Push("paren")},
+			{`\$\{#?`, LiteralStringInterpol, Push("curly")},
+			{`\$[a-zA-Z_]\w*`, NameVariable, nil},
+			{`\$(?:\d+|[#$?!_*@-])`, NameVariable, nil},
+			{`\$`, Text, nil},
+		},
+		"basic": {
+			{`\b(if|fi|else|while|do|done|for|then|return|function|case|select|continue|until|esac|elif)(\s*)\b`, ByGroups(Keyword, Text), nil},
+			{`\b(alias|bg|bind|break|builtin|caller|cd|command|compgen|complete|declare|dirs|disown|echo|enable|eval|exec|exit|export|false|fc|fg|getopts|hash|help|history|jobs|kill|let|local|logout|popd|printf|pushd|pwd|read|readonly|set|shift|shopt|source|suspend|test|time|times|trap|true|type|typeset|ulimit|umask|unalias|unset|wait)(\s*)\b`, NameBuiltin, nil},
+			{`\A#!.+\n`, CommentHashbang, nil},
+			{`#.*\n`, CommentSingle, nil},
+			{`\\[\w\W]`, LiteralStringEscape, nil},
+			{`(\b\w+)(\s*)(\+?=)`, ByGroups(NameVariable, Text, Operator), nil},
+			{`[\[\]{}()=]`, Operator, nil},
+			{`<<<`, Operator, nil},
+			// {`<<-?\s*(\'?)\\?(\w+)[\w\W]+?\2`, LiteralString, nil},
+			{`&&|\|\|`, Operator, nil},
+		},
+		"data": {
+			{`(?s)\$?"(\\\\|\\[0-7]+|\\.|[^"\\$])*"`, LiteralStringDouble, nil},
+			{`"`, LiteralStringDouble, Push("string")},
+			{`(?s)\$'(\\\\|\\[0-7]+|\\.|[^'\\])*'`, LiteralStringSingle, nil},
+			{`(?s)'.*?'`, LiteralStringSingle, nil},
+			{`;`, Punctuation, nil},
+			{`&`, Punctuation, nil},
+			{`\|`, Punctuation, nil},
+			{`\s+`, Text, nil},
+			{`\d+\b`, LiteralNumber, nil},
+			{"[^=\\s\\[\\]{}()$\"\\'`\\\\<&|;]+", Text, nil},
+			{`<`, Text, nil},
+		},
+		"string": {
+			{`"`, LiteralStringDouble, Pop(1)},
+			{`(?s)(\\\\|\\[0-7]+|\\.|[^"\\$])+`, LiteralStringDouble, nil},
+			Include("interp"),
+		},
+		"curly": {
+			{`\}`, LiteralStringInterpol, Pop(1)},
+			{`:-`, Keyword, nil},
+			{`\w+`, NameVariable, nil},
+			{"[^}:\"\\'`$\\\\]+", Punctuation, nil},
+			{`:`, Punctuation, nil},
+			Include("root"),
+		},
+		"paren": {
+			{`\)`, Keyword, Pop(1)},
+			Include("root"),
+		},
+		"math": {
+			{`\)\)`, Keyword, Pop(1)},
+			{`[-+*/%^|&]|\*\*|\|\|`, Operator, nil},
+			{`\d+#\d+`, LiteralNumber, nil},
+			{`\d+#`, LiteralNumber, nil},
+			{`\d+`, LiteralNumber, nil},
+			Include("root"),
+		},
+		"backticks": {
+			{"`", LiteralStringBacktick, Pop(1)},
+			Include("root"),
+		},
+	},
+))
diff --git a/lexers/c.go b/lexers/c.go
new file mode 100644
index 0000000..6793fde
--- /dev/null
+++ b/lexers/c.go
@@ -0,0 +1,90 @@
+package lexers
+
+import (
+	. "github.com/alecthomas/chroma" // nolint
+)
+
+// C lexer.
+var C = Register(NewLexer(
+	&Config{
+		Name:      "C",
+		Aliases:   []string{"c"},
+		Filenames: []string{"*.c", "*.h", "*.idc"},
+		MimeTypes: []string{"text/x-chdr", "text/x-csrc"},
+	},
+	Rules{
+		"whitespace": {
+			{`^#if\s+0`, CommentPreproc, Push("if0")},
+			{`^#`, CommentPreproc, Push("macro")},
+			{`^(\s*(?:/[*].*?[*]/\s*)?)(#if\s+0)`, ByGroups(UsingSelf("root"), CommentPreproc), Push("if0")},
+			{`^(\s*(?:/[*].*?[*]/\s*)?)(#)`, ByGroups(UsingSelf("root"), CommentPreproc), Push("macro")},
+			{`\n`, Text, nil},
+			{`\s+`, Text, nil},
+			{`\\\n`, Text, nil},
+			{`//(\n|[\w\W]*?[^\\]\n)`, CommentSingle, nil},
+			{`/(\\\n)?[*][\w\W]*?[*](\\\n)?/`, CommentMultiline, nil},
+			{`/(\\\n)?[*][\w\W]*`, CommentMultiline, nil},
+		},
+		"statements": {
+			{`(L?)(")`, ByGroups(LiteralStringAffix, LiteralString), Push("string")},
+			{`(L?)(')(\\.|\\[0-7]{1,3}|\\x[a-fA-F0-9]{1,2}|[^\\\'\n])(')`, ByGroups(LiteralStringAffix, LiteralStringChar, LiteralStringChar, LiteralStringChar), nil},
+			{`(\d+\.\d*|\.\d+|\d+)[eE][+-]?\d+[LlUu]*`, LiteralNumberFloat, nil},
+			{`(\d+\.\d*|\.\d+|\d+[fF])[fF]?`, LiteralNumberFloat, nil},
+			{`0x[0-9a-fA-F]+[LlUu]*`, LiteralNumberHex, nil},
+			{`0[0-7]+[LlUu]*`, LiteralNumberOct, nil},
+			{`\d+[LlUu]*`, LiteralNumberInteger, nil},
+			{`\*/`, Error, nil},
+			{`[~!%^&*+=|?:<>/-]`, Operator, nil},
+			{`[()\[\],.]`, Punctuation, nil},
+			{`(?:asm|auto|break|case|const|continue|default|do|else|enum|extern|for|goto|if|register|restricted|return|sizeof|static|struct|switch|typedef|union|volatile|while)\b`, Keyword, nil},
+			{`(bool|int|long|float|short|double|char|unsigned|signed|void)\b`, KeywordType, nil},
+			{`(?:inline|_inline|__inline|naked|restrict|thread|typename)\b`, KeywordReserved, nil},
+			{`(__m(128i|128d|128|64))\b`, KeywordReserved, nil},
+			{`__(?:asm|int8|based|except|int16|stdcall|cdecl|fastcall|int32|declspec|finally|int64|try|leave|wchar_t|w64|unaligned|raise|noop|identifier|forceinline|assume)\b`, KeywordReserved, nil},
+			{`(true|false|NULL)\b`, NameBuiltin, nil},
+			{`([a-zA-Z_]\w*)(\s*)(:)`, ByGroups(NameLabel, Text, Punctuation), nil},
+			{`[a-zA-Z_]\w*`, Name, nil},
+		},
+		"root": {
+			Include("whitespace"),
+			{`((?:[\w*\s])+?(?:\s|[*]))([a-zA-Z_]\w*)(\s*\([^;]*?\))([^;{]*)(\{)`, ByGroups(UsingSelf("root"), NameFunction, UsingSelf("root"), UsingSelf("root"), Punctuation), Push("function")},
+			{`((?:[\w*\s])+?(?:\s|[*]))([a-zA-Z_]\w*)(\s*\([^;]*?\))([^;]*)(;)`, ByGroups(UsingSelf("root"), NameFunction, UsingSelf("root"), UsingSelf("root"), Punctuation), nil},
+			Default(Push("statement")),
+		},
+		"statement": {
+			Include("whitespace"),
+			Include("statements"),
+			{`[{}]`, Punctuation, nil},
+			{`;`, Punctuation, Pop(1)},
+		},
+		"function": {
+			Include("whitespace"),
+			Include("statements"),
+			{`;`, Punctuation, nil},
+			{`\{`, Punctuation, Push()},
+			{`\}`, Punctuation, Pop(1)},
+		},
+		"string": {
+			{`"`, LiteralString, Pop(1)},
+			{`\\([\\abfnrtv"\']|x[a-fA-F0-9]{2,4}|u[a-fA-F0-9]{4}|U[a-fA-F0-9]{8}|[0-7]{1,3})`, LiteralStringEscape, nil},
+			{`[^\\"\n]+`, LiteralString, nil},
+			{`\\\n`, LiteralString, nil},
+			{`\\`, LiteralString, nil},
+		},
+		"macro": {
+			{`(include)(\s*(?:/[*].*?[*]/\s*)?)([^\n]+)`, ByGroups(CommentPreproc, Text, CommentPreprocFile), nil},
+			{`[^/\n]+`, CommentPreproc, nil},
+			{`/[*](.|\n)*?[*]/`, CommentMultiline, nil},
+			{`//.*?\n`, CommentSingle, Pop(1)},
+			{`/`, CommentPreproc, nil},
+			// {`(?<=\\)\n`, CommentPreproc, nil},
+			{`\n`, CommentPreproc, Pop(1)},
+		},
+		"if0": {
+			{`^\s*#if.*?\n`, CommentPreproc, Push()},
+			{`^\s*#el(?:se|if).*\n`, CommentPreproc, Pop(1)},
+			{`^\s*#endif.*?\n`, CommentPreproc, Pop(1)},
+			{`.*?\n`, Comment, nil},
+		},
+	},
+))
diff --git a/lexers/default.go b/lexers/fallback.go
similarity index 64%
rename from lexers/default.go
rename to lexers/fallback.go
index a5d2c7a..3827371 100644
--- a/lexers/default.go
+++ b/lexers/fallback.go
@@ -4,9 +4,9 @@ import (
 	. "github.com/alecthomas/chroma" // nolint
 )

-// Default lexer if no other is found.
-var Default = Register(NewLexer(&Config{
-	Name:      "default",
+// Fallback lexer if no other is found.
+var Fallback = Register(NewLexer(&Config{
+	Name:      "fallback",
 	Filenames: []string{"*"},
 	Priority:  99,
 }, Rules{
diff --git a/lexers/makefile.go b/lexers/makefile.go
new file mode 100644
index 0000000..cd0397c
--- /dev/null
+++ b/lexers/makefile.go
@@ -0,0 +1,51 @@
+package lexers
+
+import (
+	. "github.com/alecthomas/chroma" // nolint
+)
+
+// Makefile lexer.
+var Makefile = Register(NewLexer(
+	&Config{
+		Name:      "Makefile",
+		Aliases:   []string{"make", "makefile", "mf", "bsdmake"},
+		Filenames: []string{"*.mak", "*.mk", "Makefile", "makefile", "Makefile.*", "GNUmakefile"},
+		MimeTypes: []string{"text/x-makefile"},
+	},
+	Rules{
+		"root": {
+			{`^(?:[\t ]+.*\n|\n)+`, Using(Bash, nil), nil},
+			{`\$[<@$+%?|*]`, Keyword, nil},
+			{`\s+`, Text, nil},
+			{`#.*?\n`, Comment, nil},
+			{`(export)(\s+)`, ByGroups(Keyword, Text), Push("export")},
+			{`export\s+`, Keyword, nil},
+			{`([\w${}().-]+)(\s*)([!?:+]?=)([ \t]*)((?:.*\\\n)+|.*\n)`, ByGroups(NameVariable, Text, Operator, Text, Using(Bash, nil)), nil},
+			{`(?s)"(\\\\|\\.|[^"\\])*"`, LiteralStringDouble, nil},
+			{`(?s)'(\\\\|\\.|[^'\\])*'`, LiteralStringSingle, nil},
+			{`([^\n:]+)(:+)([ \t]*)`, ByGroups(NameFunction, Operator, Text), Push("block-header")},
+			{`\$\(`, Keyword, Push("expansion")},
+		},
+		"expansion": {
+			{`[^$a-zA-Z_()]+`, Text, nil},
+			{`[a-zA-Z_]+`, NameVariable, nil},
+			{`\$`, Keyword, nil},
+			{`\(`, Keyword, Push()},
+			{`\)`, Keyword, Pop(1)},
+		},
+		"export": {
+			{`[\w${}-]+`, NameVariable, nil},
+			{`\n`, Text, Pop(1)},
+			{`\s+`, Text, nil},
+		},
+		"block-header": {
+			{`[,|]`, Punctuation, nil},
+			{`#.*?\n`, Comment, Pop(1)},
+			{`\\\n`, Text, nil},
+			{`\$\(`, Keyword, Push("expansion")},
+			{`[a-zA-Z_]+`, Name, nil},
+			{`\n`, Text, Pop(1)},
+			{`.`, Text, nil},
+		},
+	},
+))
diff --git a/lexers/markdown.go b/lexers/markdown.go
index 81f9bfa..899b774 100644
--- a/lexers/markdown.go
+++ b/lexers/markdown.go
@@ -19,13 +19,13 @@ var Markdown = Register(NewLexer(
 			{`^(#{2,6})(.+\n)`, ByGroups(GenericSubheading, Text), nil},
 			// task list
 			{`^(\s*)([*-] )(\[[ xX]\])( .+\n)`,
-				ByGroups(Text, Keyword, Keyword, Text), nil},
+				ByGroups(Text, Keyword, Keyword, UsingSelf("inline")), nil},
 			// bulleted lists
 			{`^(\s*)([*-])(\s)(.+\n)`,
-				ByGroups(Text, Keyword, Text, Text), nil},
+				ByGroups(Text, Keyword, Text, UsingSelf("inline")), nil},
 			// numbered lists
 			{`^(\s*)([0-9]+\.)( .+\n)`,
-				ByGroups(Text, Keyword, Text), nil},
+				ByGroups(Text, Keyword, UsingSelf("inline")), nil},
 			// quote
 			{`^(\s*>\s)(.+\n)`, ByGroups(Keyword, GenericEmph), nil},
 			// text block
@@ -39,6 +39,8 @@ var Markdown = Register(NewLexer(
 			{`\\.`, Text, nil},
 			// italics
 			{`(\s)([*_][^*_]+[*_])(\W|\n)`, ByGroups(Text, GenericEmph, Text), nil},
+			// underline
+			{`(\s)(__.*?__)`, ByGroups(Whitespace, GenericUnderline), nil},
 			// bold
 			// warning: the following rule eats internal tags, e.g. in **foo _bar_ baz**, bar is not italics
 			{`(\s)(\*\*.*\*\*)`, ByGroups(Text, GenericStrong), nil},
@@ -58,12 +60,12 @@ var Markdown = Register(NewLexer(
 	},
 ))

-func handleCodeblock(groups []string, out func(Token)) {
+func handleCodeblock(groups []string, lexer Lexer, out func(Token)) {
 	out(Token{String, groups[1]})
 	out(Token{String, groups[2]})
 	out(Token{Text, groups[3]})
 	code := groups[4]
-	lexer := Registry.Get(groups[2])
-	lexer.Tokenise(code, out)
+	lexer = Registry.Get(groups[2])
+	lexer.Tokenise(nil, code, out)
 	out(Token{String, groups[5]})
 }
diff --git a/lexers/postgres.go b/lexers/postgres.go
new file mode 100644
index 0000000..65f40bc
--- /dev/null
+++ b/lexers/postgres.go
@@ -0,0 +1,57 @@
+package lexers
+
+import (
+	. "github.com/alecthomas/chroma" // nolint
+)
+
+// PostgreSQL SQL dialect lexer.
+var PostgresqlSqlDialect = Register(NewLexer(
+	&Config{
+		Name:            "PostgreSQL SQL dialect",
+		Aliases:         []string{"postgresql", "postgres"},
+		Filenames:       []string{"*.sql"},
+		MimeTypes:       []string{"text/x-postgresql"},
+		CaseInsensitive: true,
+	},
+	Rules{
+		"root": {
+			{`\s+`, Text, nil},
+			{`--.*\n?`, CommentSingle, nil},
+			{`/\*`, CommentMultiline, Push("multiline-comments")},
+			{`(bigint|bigserial|bit|bit\s+varying|bool|boolean|box|bytea|char|character|character\s+varying|cidr|circle|date|decimal|double\s+precision|float4|float8|inet|int|int2|int4|int8|integer|interval|json|jsonb|line|lseg|macaddr|money|numeric|path|pg_lsn|point|polygon|real|serial|serial2|serial4|serial8|smallint|smallserial|text|time|timestamp|timestamptz|timetz|tsquery|tsvector|txid_snapshot|uuid|varbit|varchar|with\s+time\s+zone|without\s+time\s+zone|xml|anyarray|anyelement|anyenum|anynonarray|anyrange|cstring|fdw_handler|internal|language_handler|opaque|record|void)\b`, NameBuiltin, nil},
+			{`(?:ABORT|ABSOLUTE|ACCESS|ACTION|ADD|ADMIN|AFTER|AGGREGATE|ALL|ALSO|ALTER|ALWAYS|ANALYSE|ANALYZE|AND|ANY|ARRAY|AS|ASC|ASSERTION|ASSIGNMENT|ASYMMETRIC|AT|ATTRIBUTE|AUTHORIZATION|BACKWARD|BEFORE|BEGIN|BETWEEN|BIGINT|BINARY|BIT|BOOLEAN|BOTH|BY|CACHE|CALLED|CASCADE|CASCADED|CASE|CAST|CATALOG|CHAIN|CHAR|CHARACTER|CHARACTERISTICS|CHECK|CHECKPOINT|CLASS|CLOSE|CLUSTER|COALESCE|COLLATE|COLLATION|COLUMN|COMMENT|COMMENTS|COMMIT|COMMITTED|CONCURRENTLY|CONFIGURATION|CONNECTION|CONSTRAINT|CONSTRAINTS|CONTENT|CONTINUE|CONVERSION|COPY|COST|CREATE|CROSS|CSV|CURRENT|CURRENT_CATALOG|CURRENT_DATE|CURRENT_ROLE|CURRENT_SCHEMA|CURRENT_TIME|CURRENT_TIMESTAMP|CURRENT_USER|CURSOR|CYCLE|DATA|DATABASE|DAY|DEALLOCATE|DEC|DECIMAL|DECLARE|DEFAULT|DEFAULTS|DEFERRABLE|DEFERRED|DEFINER|DELETE|DELIMITER|DELIMITERS|DESC|DICTIONARY|DISABLE|DISCARD|DISTINCT|DO|DOCUMENT|DOMAIN|DOUBLE|DROP|EACH|ELSE|ENABLE|ENCODING|ENCRYPTED|END|ENUM|ESCAPE|EVENT|EXCEPT|EXCLUDE|EXCLUDING|EXCLUSIVE|EXECUTE|EXISTS|EXPLAIN|EXTENSION|EXTERNAL|EXTRACT|FALSE|FAMILY|FETCH|FILTER|FIRST|FLOAT|FOLLOWING|FOR|FORCE|FOREIGN|FORWARD|FREEZE|FROM|FULL|FUNCTION|FUNCTIONS|GLOBAL|GRANT|GRANTED|GREATEST|GROUP|HANDLER|HAVING|HEADER|HOLD|HOUR|IDENTITY|IF|ILIKE|IMMEDIATE|IMMUTABLE|IMPLICIT|IN|INCLUDING|INCREMENT|INDEX|INDEXES|INHERIT|INHERITS|INITIALLY|INLINE|INNER|INOUT|INPUT|INSENSITIVE|INSERT|INSTEAD|INT|INTEGER|INTERSECT|INTERVAL|INTO|INVOKER|IS|ISNULL|ISOLATION|JOIN|KEY|LABEL|LANGUAGE|LARGE|LAST|LATERAL|LC_COLLATE|LC_CTYPE|LEADING|LEAKPROOF|LEAST|LEFT|LEVEL|LIKE|LIMIT|LISTEN|LOAD|LOCAL|LOCALTIME|LOCALTIMESTAMP|LOCATION|LOCK|MAPPING|MATCH|MATERIALIZED|MAXVALUE|MINUTE|MINVALUE|MODE|MONTH|MOVE|NAME|NAMES|NATIONAL|NATURAL|NCHAR|NEXT|NO|NONE|NOT|NOTHING|NOTIFY|NOTNULL|NOWAIT|NULL|NULLIF|NULLS|NUMERIC|OBJECT|OF|OFF|OFFSET|OIDS|ON|ONLY|OPERATOR|OPTION|OPTIONS|OR|ORDER|ORDINALITY|OUT|OUTER|OVER|OVERLAPS|OVERLAY|OWNED|OWNER|PARSER|PARTIAL|PARTITION|PASSING|PASSWORD|PLACING|PLANS|POLICY|POSITION|PRECEDING|PRECISION|PREPARE|PREPARED|PRESERVE|PRIMARY|PRIOR|PRIVILEGES|PROCEDURAL|PROCEDURE|PROGRAM|QUOTE|RANGE|READ|REAL|REASSIGN|RECHECK|RECURSIVE|REF|REFERENCES|REFRESH|REINDEX|RELATIVE|RELEASE|RENAME|REPEATABLE|REPLACE|REPLICA|RESET|RESTART|RESTRICT|RETURNING|RETURNS|REVOKE|RIGHT|ROLE|ROLLBACK|ROW|ROWS|RULE|SAVEPOINT|SCHEMA|SCROLL|SEARCH|SECOND|SECURITY|SELECT|SEQUENCE|SEQUENCES|SERIALIZABLE|SERVER|SESSION|SESSION_USER|SET|SETOF|SHARE|SHOW|SIMILAR|SIMPLE|SMALLINT|SNAPSHOT|SOME|STABLE|STANDALONE|START|STATEMENT|STATISTICS|STDIN|STDOUT|STORAGE|STRICT|STRIP|SUBSTRING|SYMMETRIC|SYSID|SYSTEM|TABLE|TABLES|TABLESPACE|TEMP|TEMPLATE|TEMPORARY|TEXT|THEN|TIME|TIMESTAMP|TO|TRAILING|TRANSACTION|TREAT|TRIGGER|TRIM|TRUE|TRUNCATE|TRUSTED|TYPE|TYPES|UNBOUNDED|UNCOMMITTED|UNENCRYPTED|UNION|UNIQUE|UNKNOWN|UNLISTEN|UNLOGGED|UNTIL|UPDATE|USER|USING|VACUUM|VALID|VALIDATE|VALIDATOR|VALUE|VALUES|VARCHAR|VARIADIC|VARYING|VERBOSE|VERSION|VIEW|VIEWS|VOLATILE|WHEN|WHERE|WHITESPACE|WINDOW|WITH|WITHIN|WITHOUT|WORK|WRAPPER|WRITE|XML|XMLATTRIBUTES|XMLCONCAT|XMLELEMENT|XMLEXISTS|XMLFOREST|XMLPARSE|XMLPI|XMLROOT|XMLSERIALIZE|YEAR|YES|ZONE)\b`, Keyword, nil},
+			{"[+*/<>=~!@#%^&|?-]+", Operator, nil},
+			{`::`, Operator, nil},
+			{`\$\d+`, NameVariable, nil},
+			{`([0-9]*\.[0-9]*|[0-9]+)(e[+-]?[0-9]+)?`, LiteralNumberFloat, nil},
+			{`[0-9]+`, LiteralNumberInteger, nil},
+			{`((?:E|U&)?)(')`, ByGroups(LiteralStringAffix, LiteralStringSingle), Push("string")},
+			{`((?:U&)?)(")`, ByGroups(LiteralStringAffix, LiteralStringName), Push("quoted-ident")},
+			// PL/SQL, etc.
+			// TODO: Make this work.
+			// {`(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)`, EmitterFunc(detectLanguage), nil},
+			{`[a-z_]\w*`, Name, nil},
+			{"`[^`]*`", LiteralStringName, nil},
+			{`:'[a-z]\w*\b'`, NameVariable, nil},
+			{`:"[a-z]\w*\b"`, NameVariable, nil},
+			{`:[a-z]\w*\b`, NameVariable, nil},
+			{`[;:()\[\]{},.]`, Punctuation, nil},
+		},
+		"multiline-comments": {
+			{`/\*`, CommentMultiline, Push("multiline-comments")},
+			{`\*/`, CommentMultiline, Pop(1)},
+			{`[^/*]+`, CommentMultiline, nil},
+			{`[/*]`, CommentMultiline, nil},
+		},
+		"string": {
+			{`[^']+`, LiteralStringSingle, nil},
+			{`''`, LiteralStringSingle, nil},
+			{`'`, LiteralStringSingle, Pop(1)},
+		},
+		"quoted-ident": {
+			{`[^"]+`, LiteralStringName, nil},
+			{`""`, LiteralStringName, nil},
+			{`"`, LiteralStringName, Pop(1)},
+		},
+	},
+))
diff --git a/lexers/python.go b/lexers/python.go
new file mode 100644
index 0000000..cc71779
--- /dev/null
+++ b/lexers/python.go
@@ -0,0 +1,136 @@
+package lexers
+
+import (
+	. "github.com/alecthomas/chroma" // nolint
+)
+
+// Python lexer.
+var Python = Register(NewLexer(
+	&Config{
+		Name:      "Python",
+		Aliases:   []string{"python", "py", "sage"},
+		Filenames: []string{"*.py", "*.pyw", "*.sc", "SConstruct", "SConscript", "*.tac", "*.sage"},
+		MimeTypes: []string{"text/x-python", "application/x-python"},
+	},
+	Rules{
+		"root": {
+			{`\n`, Text, nil},
+			{`^(\s*)([rRuUbB]{,2})("""(?:.|\n)*?""")`, ByGroups(Text, LiteralStringAffix, LiteralStringDoc), nil},
+			{`^(\s*)([rRuUbB]{,2})('''(?:.|\n)*?''')`, ByGroups(Text, LiteralStringAffix, LiteralStringDoc), nil},
+			{`[^\S\n]+`, Text, nil},
+			{`\A#!.+$`, CommentHashbang, nil},
+			{`#.*$`, CommentSingle, nil},
+			{`[]{}:(),;[]`, Punctuation, nil},
+			{`\\\n`, Text, nil},
+			{`\\`, Text, nil},
+			{`(in|is|and|or|not)\b`, OperatorWord, nil},
+			{`!=|==|<<|>>|[-~+/*%=<>&^|.]`, Operator, nil},
+			Include("keywords"),
+			{`(def)((?:\s|\\\s)+)`, ByGroups(Keyword, Text), Push("funcname")},
+			{`(class)((?:\s|\\\s)+)`, ByGroups(Keyword, Text), Push("classname")},
+			{`(from)((?:\s|\\\s)+)`, ByGroups(KeywordNamespace, Text), Push("fromimport")},
+			{`(import)((?:\s|\\\s)+)`, ByGroups(KeywordNamespace, Text), Push("import")},
+			Include("builtins"),
+			Include("magicfuncs"),
+			Include("magicvars"),
+			Include("backtick"),
+			{`([rR]|[uUbB][rR]|[rR][uUbB])(""")`, ByGroups(LiteralStringAffix, LiteralStringDouble), Push("tdqs")},
+			{`([rR]|[uUbB][rR]|[rR][uUbB])(''')`, ByGroups(LiteralStringAffix, LiteralStringSingle), Push("tsqs")},
+			{`([rR]|[uUbB][rR]|[rR][uUbB])(")`, ByGroups(LiteralStringAffix, LiteralStringDouble), Push("dqs")},
+			{`([rR]|[uUbB][rR]|[rR][uUbB])(')`, ByGroups(LiteralStringAffix, LiteralStringSingle), Push("sqs")},
+			{`([uUbB]?)(""")`, ByGroups(LiteralStringAffix, LiteralStringDouble), Combined("stringescape", "tdqs")},
+			{`([uUbB]?)(''')`, ByGroups(LiteralStringAffix, LiteralStringSingle), Combined("stringescape", "tsqs")},
+			{`([uUbB]?)(")`, ByGroups(LiteralStringAffix, LiteralStringDouble), Combined("stringescape", "dqs")},
+			{`([uUbB]?)(')`, ByGroups(LiteralStringAffix, LiteralStringSingle), Combined("stringescape", "sqs")},
+			Include("name"),
+			Include("numbers"),
+		},
+		"keywords": {
+			{`(?:assert|break|continue|del|elif|else|except|exec|finally|for|global|if|lambda|pass|print|raise|return|try|while|yield|yield from|as|with)\b`, Keyword, nil},
+		},
+		"builtins": {
+			{`(?:__import__|abs|all|any|apply|basestring|bin|bool|buffer|bytearray|bytes|callable|chr|classmethod|cmp|coerce|compile|complex|delattr|dict|dir|divmod|enumerate|eval|execfile|exit|file|filter|float|frozenset|getattr|globals|hasattr|hash|hex|id|input|int|intern|isinstance|issubclass|iter|len|list|locals|long|map|max|min|next|object|oct|open|ord|pow|property|range|raw_input|reduce|reload|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|unichr|unicode|vars|xrange|zip)\b`, NameBuiltin, nil},
+			{`(self|None|Ellipsis|NotImplemented|False|True|cls)\b`, NameBuiltinPseudo, nil},
+			{`(?:ArithmeticError|AssertionError|AttributeError|BaseException|DeprecationWarning|EOFError|EnvironmentError|Exception|FloatingPointError|FutureWarning|GeneratorExit|IOError|ImportError|ImportWarning|IndentationError|IndexError|KeyError|KeyboardInterrupt|LookupError|MemoryError|NameError|NotImplemented|NotImplementedError|OSError|OverflowError|OverflowWarning|PendingDeprecationWarning|ReferenceError|RuntimeError|RuntimeWarning|StandardError|StopIteration|SyntaxError|SyntaxWarning|SystemError|SystemExit|TabError|TypeError|UnboundLocalError|UnicodeDecodeError|UnicodeEncodeError|UnicodeError|UnicodeTranslateError|UnicodeWarning|UserWarning|ValueError|VMSError|Warning|WindowsError|ZeroDivisionError)\b`, NameException, nil},
+		},
+		"magicfuncs": {
+			{`(?:__abs__|__add__|__and__|__call__|__cmp__|__coerce__|__complex__|__contains__|__del__|__delattr__|__delete__|__delitem__|__delslice__|__div__|__divmod__|__enter__|__eq__|__exit__|__float__|__floordiv__|__ge__|__get__|__getattr__|__getattribute__|__getitem__|__getslice__|__gt__|__hash__|__hex__|__iadd__|__iand__|__idiv__|__ifloordiv__|__ilshift__|__imod__|__imul__|__index__|__init__|__instancecheck__|__int__|__invert__|__iop__|__ior__|__ipow__|__irshift__|__isub__|__iter__|__itruediv__|__ixor__|__le__|__len__|__long__|__lshift__|__lt__|__missing__|__mod__|__mul__|__ne__|__neg__|__new__|__nonzero__|__oct__|__op__|__or__|__pos__|__pow__|__radd__|__rand__|__rcmp__|__rdiv__|__rdivmod__|__repr__|__reversed__|__rfloordiv__|__rlshift__|__rmod__|__rmul__|__rop__|__ror__|__rpow__|__rrshift__|__rshift__|__rsub__|__rtruediv__|__rxor__|__set__|__setattr__|__setitem__|__setslice__|__str__|__sub__|__subclasscheck__|__truediv__|__unicode__|__xor__)\b`, NameFunctionMagic, nil},
+		},
+		"magicvars": {
+			{`(?:__bases__|__class__|__closure__|__code__|__defaults__|__dict__|__doc__|__file__|__func__|__globals__|__metaclass__|__module__|__mro__|__name__|__self__|__slots__|__weakref__)\b`, NameVariableMagic, nil},
+		},
+		"numbers": {
+			{`(\d+\.\d*|\d*\.\d+)([eE][+-]?[0-9]+)?j?`, LiteralNumberFloat, nil},
+			{`\d+[eE][+-]?[0-9]+j?`, LiteralNumberFloat, nil},
+			{`0[0-7]+j?`, LiteralNumberOct, nil},
+			{`0[bB][01]+`, LiteralNumberBin, nil},
+			{`0[xX][a-fA-F0-9]+`, LiteralNumberHex, nil},
+			{`\d+L`, LiteralNumberIntegerLong, nil},
+			{`\d+j?`, LiteralNumberInteger, nil},
+		},
+		"backtick": {
+			{"`.*?`", LiteralStringBacktick, nil},
+		},
+		"name": {
+			{`@[\w.]+`, NameDecorator, nil},
+			{`[a-zA-Z_]\w*`, Name, nil},
+		},
+		"funcname": {
+			Include("magicfuncs"),
+			{`[a-zA-Z_]\w*`, NameFunction, Pop(1)},
+			Default(Pop(1)),
+		},
+		"classname": {
+			{`[a-zA-Z_]\w*`, NameClass, Pop(1)},
+		},
+		"import": {
+			{`(?:[ \t]|\\\n)+`, Text, nil},
+			{`as\b`, KeywordNamespace, nil},
+			{`,`, Operator, nil},
+			{`[a-zA-Z_][\w.]*`, NameNamespace, nil},
+			Default(Pop(1)),
+		},
+		"fromimport": {
+			{`(?:[ \t]|\\\n)+`, Text, nil},
+			{`import\b`, KeywordNamespace, Pop(1)},
+			{`None\b`, NameBuiltinPseudo, Pop(1)},
+			{`[a-zA-Z_.][\w.]*`, NameNamespace, nil},
+			Default(Pop(1)),
+		},
+		"stringescape": {
+			{`\\([\\abfnrtv"\']|\n|N\{.*?\}|u[a-fA-F0-9]{4}|U[a-fA-F0-9]{8}|x[a-fA-F0-9]{2}|[0-7]{1,3})`, LiteralStringEscape, nil},
+		},
+		"strings-single": {
+			{`%(\(\w+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?[hlL]?[E-GXc-giorsux%]`, LiteralStringInterpol, nil},
+			{`[^\\\'"%\n]+`, LiteralStringSingle, nil},
+			{`[\'"\\]`, LiteralStringSingle, nil},
+			{`%`, LiteralStringSingle, nil},
+		},
+		"strings-double": {
+			{`%(\(\w+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?[hlL]?[E-GXc-giorsux%]`, LiteralStringInterpol, nil},
+			{`[^\\\'"%\n]+`, LiteralStringDouble, nil},
+			{`[\'"\\]`, LiteralStringDouble, nil},
+			{`%`, LiteralStringDouble, nil},
+		},
+		"dqs": {
+			{`"`, LiteralStringDouble, Pop(1)},
+			{`\\\\|\\"|\\\n`, LiteralStringEscape, nil},
+			Include("strings-double"),
+		},
+		"sqs": {
+			{`'`, LiteralStringSingle, Pop(1)},
+			{`\\\\|\\'|\\\n`, LiteralStringEscape, nil},
+			Include("strings-single"),
+		},
+		"tdqs": {
+			{`"""`, LiteralStringDouble, Pop(1)},
+			Include("strings-double"),
+			{`\n`, LiteralStringDouble, nil},
+		},
+		"tsqs": {
+			{`'''`, LiteralStringSingle, Pop(1)},
+			Include("strings-single"),
+			{`\n`, LiteralStringSingle, nil},
+		},
+	},
+))
diff --git a/lexers/registry.go b/lexers/registry.go
index 9d57ac4..77a6329 100644
--- a/lexers/registry.go
+++ b/lexers/registry.go
@@ -41,7 +41,7 @@ func (r *registry) Get(name string) chroma.Lexer {
 	if ok {
 		return lexer
 	}
-	return Default
+	return Fallback
 }

 // Match returns all lexers matching filename.
diff --git a/modifiers.go b/modifiers.go
deleted file mode 100644
index 794e8e6..0000000
--- a/modifiers.go
+++ /dev/null
@@ -1,79 +0,0 @@
-package chroma
-
-import "fmt"
-
-// A Modifier modifies the behaviour of the lexer.
-type Modifier interface {
-	// Preprocess the lexer rules.
-	//
-	// "self" and "rule" are the rule name and index this Modifier is associated with.
-	Preprocess(rules map[string][]CompiledRule, self string, rule int) error
-	// Mutate the lexer state machine as it is processing.
-	Mutate(state *LexerState) error
-}
-
-// A MutatorFunc is a Modifier that mutates the lexer state machine as it is processing.
-type MutatorFunc func(state *LexerState) error
-
-func (m MutatorFunc) Preprocess(rules map[string][]CompiledRule, self string, rule int) error {
-	return nil
-}
-
-func (m MutatorFunc) Mutate(state *LexerState) error {
-	return m(state)
-}
-
-// A PreprocessorFunc is a Modifier that pre-processes the lexer rules.
-type PreprocessorFunc func(rules map[string][]CompiledRule, self string, rule int) error
-
-func (p PreprocessorFunc) Preprocess(rules map[string][]CompiledRule, self string, rule int) error {
-	return p(rules, self, rule)
-}
-
-func (p PreprocessorFunc) Mutate(state *LexerState) error {
-	return nil
-}
-
-// Modifiers applies a set of Modifiers in order.
-func Modifiers(modifiers ...Modifier) MutatorFunc {
-	return func(state *LexerState) error {
-		for _, modifier := range modifiers {
-			if err := modifier.Mutate(state); err != nil {
-				return err
-			}
-		}
-		return nil
-	}
-}
-
-// Include the given state.
-func Include(state string) Rule {
-	return Rule{
-		Modifier: PreprocessorFunc(func(rules map[string][]CompiledRule, self string, rule int) error {
-			includedRules, ok := rules[state]
-			if !ok {
-				return fmt.Errorf("invalid include state %q", state)
-			}
-			stateRules := rules[self]
-			stateRules = append(stateRules[:rule], append(includedRules, stateRules[rule+1:]...)...)
-			rules[self] = stateRules
-			return nil
-		}),
-	}
-}
-
-// Push states onto the stack.
-func Push(states ...string) MutatorFunc {
-	return func(s *LexerState) error {
-		s.Stack = append(s.Stack, states...)
-		return nil
-	}
-}
-
-// Pop state from the stack when rule matches.
-func Pop(n int) MutatorFunc {
-	return func(state *LexerState) error {
-		state.Stack = state.Stack[:len(state.Stack)-n]
-		return nil
-	}
-}
diff --git a/modifiers_test.go b/modifiers_test.go
deleted file mode 100644
index be5e24e..0000000
--- a/modifiers_test.go
+++ /dev/null
@@ -1,6 +0,0 @@
-package chroma
-
-import "testing"
-
-func TestPop(t *testing.T) {
-}
diff --git a/mutators.go b/mutators.go
new file mode 100644
index 0000000..a6f8c49
--- /dev/null
+++ b/mutators.go
@@ -0,0 +1,90 @@
+package chroma
+
+import (
+	"fmt"
+	"strings"
+)
+
+// A Mutator modifies the behaviour of the lexer.
+type Mutator interface {
+	// Mutate the lexer state machine as it is processing.
+	Mutate(state *LexerState) error
+}
+
+// A MutatorFunc is a Mutator that mutates the lexer state machine as it is processing.
+type MutatorFunc func(state *LexerState) error
+
+func (m MutatorFunc) Mutate(state *LexerState) error { return m(state) }
+
+// Mutators applies a set of Mutators in order.
+func Mutators(modifiers ...Mutator) MutatorFunc {
+	return func(state *LexerState) error {
+		for _, modifier := range modifiers {
+			if err := modifier.Mutate(state); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+}
+
+// Include the given state.
+func Include(state string) Rule {
+	return Rule{
+		Mutator: MutatorFunc(func(ls *LexerState) error {
+			includedRules, ok := ls.Rules[state]
+			if !ok {
+				return fmt.Errorf("invalid include state %q", state)
+			}
+			stateRules := ls.Rules[ls.State]
+			stateRules = append(stateRules[:ls.Rule], append(includedRules, stateRules[ls.Rule+1:]...)...)
+			ls.Rules[ls.State] = stateRules
+			return nil
+		}),
+	}
+}
+
+// Combined creates a new anonymous state from the given states, and pushes that state.
+func Combined(states ...string) MutatorFunc {
+	return func(s *LexerState) error {
+		name := "__combined_" + strings.Join(states, "__")
+		if _, ok := s.Rules[name]; !ok {
+			combined := []CompiledRule{}
+			for _, state := range states {
+				rules, ok := s.Rules[state]
+				if !ok {
+					return fmt.Errorf("invalid combine state %q", state)
+				}
+				combined = append(combined, rules...)
+			}
+			s.Rules[name] = combined
+		}
+		s.Rules[s.State][s.Rule].Mutator = Push(name)
+		s.Stack = append(s.Stack, name)
+		return nil
+	}
+}
+
+// Push states onto the stack.
+func Push(states ...string) MutatorFunc {
+	return func(s *LexerState) error {
+		if len(states) == 0 {
+			s.Stack = append(s.Stack, s.State)
+		} else {
+			s.Stack = append(s.Stack, states...)
+		}
+		return nil
+	}
+}
+
+// Pop state from the stack when rule matches.
+func Pop(n int) MutatorFunc {
+	return func(state *LexerState) error {
+		state.Stack = state.Stack[:len(state.Stack)-n]
+		return nil
+	}
+}
+
+func Default(mutator Mutator) Rule {
+	return Rule{Mutator: mutator}
+}
diff --git a/mutators_test.go b/mutators_test.go
new file mode 100644
index 0000000..581fca1
--- /dev/null
+++ b/mutators_test.go
@@ -0,0 +1,55 @@
+package chroma
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestInclude(t *testing.T) {
+	include := Include("other")
+	actual := CompiledRules{
+		"root": {
+			CompiledRule{Rule: include},
+		},
+		"other": {
+			CompiledRule{Rule: Rule{
+				Pattern: "//.+",
+				Type:    Comment,
+			}},
+			CompiledRule{Rule: Rule{
+				Pattern: `"[^"]*"`,
+				Type:    String,
+			}},
+		},
+	}
+	state := &LexerState{
+		State: "root",
+		Rules: actual,
+	}
+	err := include.Mutator.Mutate(state)
+	require.NoError(t, err)
+	expected := CompiledRules{
+		"root": {
+			CompiledRule{Rule: Rule{
+				Pattern: "//.+",
+				Type:    Comment,
+			}},
+			CompiledRule{Rule: Rule{
+				Pattern: `"[^"]*"`,
+				Type:    String,
+			}},
+		},
+		"other": {
+			CompiledRule{Rule: Rule{
+				Pattern: "//.+",
+				Type:    Comment,
+			}},
+			CompiledRule{Rule: Rule{
+				Pattern: `"[^"]*"`,
+				Type:    String,
+			}},
+		},
+	}
+	require.Equal(t, expected, actual)
+}
diff --git a/tokentype_string.go b/tokentype_string.go
index 3c1542c..236e561 100644
--- a/tokentype_string.go
+++ b/tokentype_string.go
@@ -4,7 +4,7 @@ package chroma

 import "fmt"

-const _TokenType_name = "EscapeErrorOtherKeywordKeywordConstantKeywordDeclarationKeywordNamespaceKeywordPseudoKeywordReservedKeywordTypeNameNameAttributeNameBuiltinNameBuiltinPseudoNameClassNameConstantNameDecoratorNameEntityNameExceptionNameFunctionNameFunctionMagicNamePropertyNameLabelNameNamespaceNameOtherNameTagNameVariableNameVariableClassNameVariableGlobalNameVariableInstanceNameVariableMagicLiteralLiteralDateLiteralStringLiteralStringAffixLiteralStringBacktickLiteralStringCharLiteralStringDelimiterLiteralStringDocLiteralStringDoubleLiteralStringEscapeLiteralStringHeredocLiteralStringInterpolLiteralStringOtherLiteralStringRegexLiteralStringSingleLiteralStringSymbolLiteralNumberLiteralNumberBinLiteralNumberFloatLiteralNumberHexLiteralNumberIntegerLiteralNumberIntegerLongLiteralNumberOctOperatorOperatorWordPunctuationCommentCommentHashbangCommentMultilineCommentPreprocCommentPreprocFileCommentSingleCommentSpecialGenericGenericDeletedGenericEmphGenericErrorGenericHeadingGenericInsertedGenericOutputGenericPromptGenericStrongGenericSubheadingGenericTracebackTextTextWhitespace"
+const _TokenType_name = "EscapeErrorOtherKeywordKeywordConstantKeywordDeclarationKeywordNamespaceKeywordPseudoKeywordReservedKeywordTypeNameNameAttributeNameBuiltinNameBuiltinPseudoNameClassNameConstantNameDecoratorNameEntityNameExceptionNameFunctionNameFunctionMagicNamePropertyNameLabelNameNamespaceNameOtherNameTagNameVariableNameVariableClassNameVariableGlobalNameVariableInstanceNameVariableMagicLiteralLiteralDateLiteralStringLiteralStringAffixLiteralStringBacktickLiteralStringCharLiteralStringDelimiterLiteralStringDocLiteralStringDoubleLiteralStringEscapeLiteralStringHeredocLiteralStringInterpolLiteralStringOtherLiteralStringRegexLiteralStringSingleLiteralStringSymbolLiteralStringNameLiteralNumberLiteralNumberBinLiteralNumberFloatLiteralNumberHexLiteralNumberIntegerLiteralNumberIntegerLongLiteralNumberOctOperatorOperatorWordPunctuationCommentCommentHashbangCommentMultilineCommentSingleCommentSpecialCommentPreprocCommentPreprocFileGenericGenericDeletedGenericEmphGenericErrorGenericHeadingGenericInsertedGenericOutputGenericPromptGenericStrongGenericSubheadingGenericTracebackGenericUnderlineTextTextWhitespace"

 var _TokenType_map = map[TokenType]string{
 	0:    _TokenType_name[0:6],
@@ -54,36 +54,38 @@ var _TokenType_map = map[TokenType]string{
 	3111: _TokenType_name[598:616],
 	3112: _TokenType_name[616:635],
 	3113: _TokenType_name[635:654],
-	3200: _TokenType_name[654:667],
-	3201: _TokenType_name[667:683],
-	3202: _TokenType_name[683:701],
-	3203: _TokenType_name[701:717],
-	3204: _TokenType_name[717:737],
-	3205: _TokenType_name[737:761],
-	3206: _TokenType_name[761:777],
-	4000: _TokenType_name[777:785],
-	4001: _TokenType_name[785:797],
-	5000: _TokenType_name[797:808],
-	6000: _TokenType_name[808:815],
-	6001: _TokenType_name[815:830],
-	6002: _TokenType_name[830:846],
-	6003: _TokenType_name[846:860],
-	6004: _TokenType_name[860:878],
-	6005: _TokenType_name[878:891],
-	6006: _TokenType_name[891:905],
-	7000: _TokenType_name[905:912],
-	7001: _TokenType_name[912:926],
-	7002: _TokenType_name[926:937],
-	7003: _TokenType_name[937:949],
-	7004: _TokenType_name[949:963],
-	7005: _TokenType_name[963:978],
-	7006: _TokenType_name[978:991],
-	7007: _TokenType_name[991:1004],
-	7008: _TokenType_name[1004:1017],
-	7009: _TokenType_name[1017:1034],
-	7010: _TokenType_name[1034:1050],
-	8000: _TokenType_name[1050:1054],
-	8001: _TokenType_name[1054:1068],
+	3114: _TokenType_name[654:671],
+	3200: _TokenType_name[671:684],
+	3201: _TokenType_name[684:700],
+	3202: _TokenType_name[700:718],
+	3203: _TokenType_name[718:734],
+	3204: _TokenType_name[734:754],
+	3205: _TokenType_name[754:778],
+	3206: _TokenType_name[778:794],
+	4000: _TokenType_name[794:802],
+	4001: _TokenType_name[802:814],
+	5000: _TokenType_name[814:825],
+	6000: _TokenType_name[825:832],
+	6001: _TokenType_name[832:847],
+	6002: _TokenType_name[847:863],
+	6003: _TokenType_name[863:876],
+	6004: _TokenType_name[876:890],
+	6100: _TokenType_name[890:904],
+	6101: _TokenType_name[904:922],
+	7000: _TokenType_name[922:929],
+	7001: _TokenType_name[929:943],
+	7002: _TokenType_name[943:954],
+	7003: _TokenType_name[954:966],
+	7004: _TokenType_name[966:980],
+	7005: _TokenType_name[980:995],
+	7006: _TokenType_name[995:1008],
+	7007: _TokenType_name[1008:1021],
+	7008: _TokenType_name[1021:1034],
+	7009: _TokenType_name[1034:1051],
+	7010: _TokenType_name[1051:1067],
+	7011: _TokenType_name[1067:1083],
+	8000: _TokenType_name[1083:1087],
+	8001: _TokenType_name[1087:1101],
 }

 func (i TokenType) String() string {
diff --git a/types.go b/types.go
index 45f2b12..7bfe4ba 100644
--- a/types.go
+++ b/types.go
@@ -74,6 +74,7 @@ const (
 	LiteralStringRegex
 	LiteralStringSingle
 	LiteralStringSymbol
+	LiteralStringName
 )

 // Literals.
@@ -103,12 +104,16 @@ const (
 	Comment TokenType = 6000 + iota
 	CommentHashbang
 	CommentMultiline
-	CommentPreproc
-	CommentPreprocFile
 	CommentSingle
 	CommentSpecial
 )

+// Preprocessor "comments".
+const (
+	CommentPreproc TokenType = 6100 + iota
+	CommentPreprocFile
+)
+
 // Generic tokens.
 const (
 	Generic TokenType = 7000 + iota
@@ -122,6 +127,7 @@ const (
 	GenericStrong
 	GenericSubheading
 	GenericTraceback
+	GenericUnderline
 )

 // Text.
@@ -176,6 +182,6 @@ func (t TokenType) InSubCategory(other TokenType) bool {
 	return t/100 == other/100
 }

-func (t TokenType) Emit(groups []string, out func(Token)) {
+func (t TokenType) Emit(groups []string, lexer Lexer, out func(Token)) {
 	out(Token{Type: t, Value: groups[0]})
 }
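
Usage notes: after this patch, every Lexer.Tokenise call takes a *TokeniseOptions as its first argument (nil starts in the "root" state), the package-level chroma.Tokenise helper collects tokens into a slice, and formatters.Console(nil) falls back to DefaultConsoleTheme. Below is a minimal sketch of driving the new API end to end; the lexers import path matches this repository, while the formatters import path, the choice of lexers.Python, and the sample input are illustrative assumptions rather than part of the patch:

package main

import (
	"fmt"
	"os"

	"github.com/alecthomas/chroma"
	"github.com/alecthomas/chroma/formatters" // assumed path, per the formatters/ directory above
	"github.com/alecthomas/chroma/lexers"
)

func main() {
	source := "print('hello')\n"

	// Coalesce wraps a lexer so runs of adjacent tokens with the same
	// type are merged into a single token.
	lexer := chroma.Coalesce(lexers.Python)

	// Collect tokens into a slice with the new package-level helper.
	// A nil *TokeniseOptions is equivalent to &TokeniseOptions{State: "root"}.
	tokens, err := chroma.Tokenise(lexer, nil, source)
	if err != nil {
		panic(err)
	}
	for _, token := range tokens {
		fmt.Println(token) // Token's String method prints Token{Type, "value"}.
	}

	// Or stream tokens straight to a formatter, as cmd/chroma now does.
	writer, err := formatters.Console(nil).Format(os.Stdout)
	if err != nil {
		panic(err)
	}
	if err := lexer.Tokenise(nil, source, writer); err != nil {
		panic(err)
	}
}

Note the ordering change inside the regex lexer: a rule's Emitter now runs before its Mutator, so a Push or Pop only affects subsequent matches, and mutators such as Include, Combined, UsingSelf, and Default rely on the LexerState carrying the compiled rules, current state name, and current rule index at runtime instead of a separate preprocessing pass.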