mirror of https://github.com/alecthomas/chroma.git synced 2025-01-26 03:20:10 +02:00

Use pointers to tokens + support regex flags in importer.

Alec Thomas 2017-06-05 10:29:50 +10:00
parent c64e5829b5
commit 1f47bd705c
10 changed files with 83 additions and 56 deletions


@@ -24,6 +24,15 @@ var {{upper_name}} = Register(NewLexer(
 		Aliases:   []string{ {{#aliases}}"{{.}}", {{/aliases}} },
 		Filenames: []string{ {{#filenames}}"{{.}}", {{/filenames}} },
 		MimeTypes: []string{ {{#mimetypes}}"{{.}}", {{/mimetypes}} },
+		{{#re_not_multiline}}
+		NotMultiline: true,
+		{{/re_not_multiline}}
+		{{#re_dotall}}
+		DotAll: true,
+		{{/re_dotall}}
+		{{#re_ignorecase}}
+		CaseInsensitive: true,
+		{{/re_ignorecase}}
 	},
 	Rules{
 {{#tokens}}
@@ -136,6 +145,21 @@ def translate_rules(rules):
     return out


+class TemplateView(object):
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    def re_not_multiline(self):
+        return not (self.regex_flags & re.MULTILINE)
+
+    def re_dotall(self):
+        return self.regex_flags & re.DOTALL
+
+    def re_ignorecase(self):
+        return self.regex_flags & re.IGNORECASE
+
+
 def main():
     package_name, symbol_name = sys.argv[1].rsplit(sep=".", maxsplit=1)
@@ -145,15 +169,15 @@ def main():
     assert issubclass(lexer_cls, pygments_lexer.RegexLexer), 'can only translate from RegexLexer'

-    print(pystache.render(TEMPLATE, {
-        'name': lexer_cls.name,
-        'options': lexer_cls.flags,
-        'upper_name': to_camel_case(lexer_cls.name),
-        'aliases': lexer_cls.aliases,
-        'filenames': lexer_cls.filenames,
-        'mimetypes': lexer_cls.mimetypes,
-        'tokens': [{'state': state, 'rules': translate_rules(rules)} for (state, rules) in lexer_cls.get_tokendefs().items()],
-    }))
+    print(pystache.render(TEMPLATE, TemplateView(
+        name=lexer_cls.name,
+        regex_flags=lexer_cls.flags,
+        upper_name=to_camel_case(lexer_cls.name),
+        aliases=lexer_cls.aliases,
+        filenames=lexer_cls.filenames,
+        mimetypes=lexer_cls.mimetypes,
+        tokens=[{'state': state, 'rules': translate_rules(rules)} for (state, rules) in lexer_cls.get_tokendefs().items()],
+    )))


 if __name__ == '__main__':
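To make the new flag handling concrete: a Pygments lexer compiled with re.DOTALL and re.IGNORECASE but not re.MULTILINE would now come out of the importer looking roughly like this. The lexer name, aliases and globs below are made up for illustration; only the three flag fields come from this commit's template changes.

var Example = Register(NewLexer(
	&Config{
		Name:      "Example",
		Aliases:   []string{"example"},
		Filenames: []string{"*.ex"},
		MimeTypes: []string{"text/x-example"},
		// Rendered because re.MULTILINE is absent from the source flags:
		NotMultiline: true,
		// Rendered because re.DOTALL and re.IGNORECASE are present:
		DotAll:          true,
		CaseInsensitive: true,
	},
	Rules{
		// ... translated rules ...
	},
))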


@@ -52,15 +52,14 @@ func main() {
 	}
 }

-func getWriter(w io.Writer) func(chroma.Token) {
+func getWriter(w io.Writer) func(*chroma.Token) {
 	if *tokensFlag {
-		return func(token chroma.Token) {
+		return func(token *chroma.Token) {
 			fmt.Println(token)
 		}
-	} else {
-		formatter := formatters.Console(formatters.DefaultConsoleTheme)
-		writer, err := formatter.Format(w)
-		kingpin.FatalIfError(err, "")
-		return writer
 	}
+	formatter := formatters.Console(formatters.DefaultConsoleTheme)
+	writer, err := formatter.Format(w)
+	kingpin.FatalIfError(err, "")
+	return writer
 }
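For context, a minimal sketch of how the returned writer plugs into tokenisation under the new pointer signature (lexer and source are stand-ins, not variables from this file):

writer := getWriter(os.Stdout)
err := lexer.Tokenise(nil, source, writer)
kingpin.FatalIfError(err, "")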


@@ -9,22 +9,18 @@ type coalescer struct {
 	Lexer
 }

-func (d *coalescer) Tokenise(options *TokeniseOptions, text string, out func(Token)) error {
+func (d *coalescer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
 	var last *Token
-	defer func() {
-		if last != nil {
-			out(*last)
-		}
-	}()
-	return d.Lexer.Tokenise(options, text, func(token Token) {
+	defer func() { out(last) }()
+	return d.Lexer.Tokenise(options, text, func(token *Token) {
 		if last == nil {
-			last = &token
+			last = token
 		} else {
 			if last.Type == token.Type {
 				last.Value += token.Value
 			} else {
-				out(*last)
-				last = &token
+				out(last)
+				last = token
 			}
 		}
 	})
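Because tokens now arrive as pointers, the coalescer mutates the buffered token in place (last.Value += token.Value) rather than copying and re-emitting it. Note also that the simplified defer calls out(last) even when the input produced no tokens, in which case last is still nil. A usage sketch, assuming Coalesce is the constructor that wraps a Lexer in this coalescer (as the test below suggests):

lexer := Coalesce(punctuationLexer) // punctuationLexer: an illustrative inner lexer
tokens, err := Tokenise(lexer, nil, "!@#$")
// err == nil; four single-rune Punctuation tokens are merged into one:
// tokens == []*Token{&Token{Punctuation, "!@#$"}}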


@@ -12,10 +12,10 @@ func TestCoalesce(t *testing.T) {
 			Rule{`[[:punct:]]`, Punctuation, nil},
 		},
 	}))
-	actual, err := Tokenise(lexer, nil, "!@#$%")
+	actual, err := Tokenise(lexer, nil, "!@#$")
 	require.NoError(t, err)
-	expected := []Token{
-		Token{Punctuation, "!@#$%"},
+	expected := []*Token{
+		&Token{Punctuation, "!@#$"},
 	}
 	require.Equal(t, expected, actual)
 }


@@ -8,5 +8,5 @@ import (

 // Formatter returns a formatting function for tokens.
 type Formatter interface {
-	Format(w io.Writer) (func(chroma.Token), error)
+	Format(w io.Writer) (func(*chroma.Token), error)
 }
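From the implementing side, the new contract looks like this; a minimal sketch (the type name is illustrative, not part of this commit):

type plainFormatter struct{}

// Format returns a writer that emits each token's value verbatim, with no styling.
func (plainFormatter) Format(w io.Writer) (func(*chroma.Token), error) {
	return func(token *chroma.Token) {
		io.WriteString(w, token.Value)
	}, nil
}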


@@ -34,8 +34,8 @@ type consoleFormatter struct {
 	theme map[TokenType]string
 }

-func (c *consoleFormatter) Format(w io.Writer) (func(Token), error) {
-	return func(token Token) {
+func (c *consoleFormatter) Format(w io.Writer) (func(*Token), error) {
+	return func(token *Token) {
 		clr, ok := c.theme[token.Type]
 		if !ok {
 			clr, ok = c.theme[token.Type.SubCategory()]
@@ -46,8 +46,12 @@ func (c *consoleFormatter) Format(w io.Writer) (func(Token), error) {
 				}
 			}
 		}
-		fmt.Fprint(w, clr)
+		if clr != "" {
+			fmt.Fprint(w, clr)
+		}
 		fmt.Fprint(w, token.Value)
-		fmt.Fprintf(w, "\033[0m")
+		if clr != "" {
+			fmt.Fprintf(w, "\033[0m")
+		}
 	}, nil
 }


@@ -57,13 +57,14 @@ type Config struct {
 	// TabSize int
 }

 // Token output to formatter.
 type Token struct {
 	Type  TokenType
 	Value string
 }

-func (t Token) String() string   { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }
-func (t Token) GoString() string { return t.String() }
+func (t *Token) String() string   { return t.Value }
+func (t *Token) GoString() string { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }

 type TokeniseOptions struct {
 	// State to start tokenisation in. Defaults to "root".
@@ -72,7 +73,7 @@

 type Lexer interface {
 	Config() *Config
-	Tokenise(options *TokeniseOptions, text string, out func(Token)) error
+	Tokenise(options *TokeniseOptions, text string, out func(*Token)) error
 }

 // Analyser determines if this lexer is appropriate for the given text.
@@ -89,18 +90,18 @@ type Rule struct {
 // An Emitter takes group matches and returns tokens.
 type Emitter interface {
 	// Emit tokens for the given regex groups.
-	Emit(groups []string, lexer Lexer, out func(Token))
+	Emit(groups []string, lexer Lexer, out func(*Token))
 }

 // EmitterFunc is a function that is an Emitter.
-type EmitterFunc func(groups []string, lexer Lexer, out func(Token))
+type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))

 // Emit tokens for groups.
-func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(Token)) { e(groups, lexer, out) }
+func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }

 // ByGroups emits a token for each matching group in the rule's regex.
 func ByGroups(emitters ...Emitter) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(Token)) {
+	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
 		for i, group := range groups[1:] {
 			emitters[i].Emit([]string{group}, lexer, out)
 		}
@@ -110,7 +111,7 @@ func ByGroups(emitters ...Emitter) Emitter {
 // Using returns an Emitter that uses a given Lexer for parsing and emitting.
 func Using(lexer Lexer, options *TokeniseOptions) Emitter {
-	return EmitterFunc(func(groups []string, _ Lexer, out func(Token)) {
+	return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
 		if err := lexer.Tokenise(options, groups[0], out); err != nil {
 			panic(err)
 		}
@@ -119,7 +120,7 @@ func Using(lexer Lexer, options *TokeniseOptions) Emitter {
 // UsingSelf is like Using, but uses the current Lexer.
 func UsingSelf(state string) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(Token)) {
+	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
 		if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
 			panic(err)
 		}
@@ -151,6 +152,9 @@ func MustNewLexer(config *Config, rules Rules) Lexer {
 // "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
 // that match input, optionally modify lexer state, and output tokens.
 func NewLexer(config *Config, rules Rules) (Lexer, error) {
+	if config == nil {
+		config = &Config{}
+	}
 	if _, ok := rules["root"]; !ok {
 		return nil, fmt.Errorf("no \"root\" state")
 	}
@@ -208,7 +212,7 @@ func (r *regexLexer) Config() *Config {
 	return r.config
 }

-func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(Token)) error {
+func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
 	if options == nil {
 		options = defaultOptions
 	}
@@ -223,7 +227,7 @@ func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(To
 		// fmt.Println(text[state.Pos:state.Pos+1], rule, state.Text[state.Pos:state.Pos+1])
 		// No match.
 		if index == nil {
-			out(Token{Error, state.Text[state.Pos : state.Pos+1]})
+			out(&Token{Error, state.Text[state.Pos : state.Pos+1]})
 			state.Pos++
 			continue
 		}
@@ -252,9 +256,9 @@ func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(To
 }

 // Tokenise text using lexer, returning tokens as a slice.
-func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
-	out := []Token{}
-	return out, lexer.Tokenise(options, text, func(token Token) { out = append(out, token) })
+func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
+	out := []*Token{}
+	return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
 }

 func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []int) {
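One behavioural change in this file beyond the signatures: Token's String method now returns the raw token value, and the debug form moved to GoString. Roughly (assuming TokenType formats as its name):

t := &Token{Punctuation, "!"}
fmt.Println(t)         // !
fmt.Printf("%#v\n", t) // Token{Punctuation, "!"}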


@@ -35,7 +35,7 @@ func TestSimpleLexer(t *testing.T) {
 	a = 10
 `)
 	require.NoError(t, err)
-	expected := []Token{
+	expected := []*Token{
 		{Whitespace, "\n\t"},
 		{Comment, "; this is a comment"},
 		{Whitespace, "\n\t"},


@@ -60,12 +60,12 @@ var Markdown = Register(NewLexer(
 	},
 ))

-func handleCodeblock(groups []string, lexer Lexer, out func(Token)) {
-	out(Token{String, groups[1]})
-	out(Token{String, groups[2]})
-	out(Token{Text, groups[3]})
+func handleCodeblock(groups []string, lexer Lexer, out func(*Token)) {
+	out(&Token{String, groups[1]})
+	out(&Token{String, groups[2]})
+	out(&Token{Text, groups[3]})
 	code := groups[4]
 	lexer = Registry.Get(groups[2])
 	lexer.Tokenise(nil, code, out)
-	out(Token{String, groups[5]})
+	out(&Token{String, groups[5]})
 }
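Since the sub-lexer writes through the same out callback, its tokens stream inline between the fence tokens. A sketch of the call, with a hypothetical group layout for a Go code fence (it assumes a lexer is registered under "go"):

handleCodeblock([]string{
	"```go\nprintln()\n```", // groups[0]: the entire match
	"```",                   // groups[1]: opening fence
	"go",                    // groups[2]: language, used for the Registry lookup
	"\n",                    // groups[3]
	"println()\n",           // groups[4]: body, tokenised by the sub-lexer
	"```",                   // groups[5]: closing fence
}, nil, func(t *Token) { fmt.Print(t.Value) })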


@@ -182,6 +182,6 @@ func (t TokenType) InSubCategory(other TokenType) bool {
 	return t/100 == other/100
 }

-func (t TokenType) Emit(groups []string, lexer Lexer, out func(Token)) {
-	out(Token{Type: t, Value: groups[0]})
+func (t TokenType) Emit(groups []string, lexer Lexer, out func(*Token)) {
+	out(&Token{Type: t, Value: groups[0]})
 }
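This method is what lets a rule name a TokenType directly as its Emitter, as the coalesce test above does with Punctuation; the whole match becomes a single heap-allocated token. Schematically (again assuming TokenType prints as its name):

var e Emitter = Punctuation // TokenType satisfies Emitter
e.Emit([]string{"!"}, nil, func(t *Token) {
	fmt.Println(t.Type, t.Value) // Punctuation !
})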