Mirror of https://github.com/alecthomas/chroma.git (synced 2025-01-26 03:20:10 +02:00)
Use pointers to tokens + support regex flags in importer.
parent c64e5829b5
commit 1f47bd705c
@@ -24,6 +24,15 @@ var {{upper_name}} = Register(NewLexer(
 		Aliases: []string{ {{#aliases}}"{{.}}", {{/aliases}} },
 		Filenames: []string{ {{#filenames}}"{{.}}", {{/filenames}} },
 		MimeTypes: []string{ {{#mimetypes}}"{{.}}", {{/mimetypes}} },
+		{{#re_not_multiline}}
+		NotMultiline: true,
+		{{/re_not_multiline}}
+		{{#re_dotall}}
+		DotAll: true,
+		{{/re_dotall}}
+		{{#re_ignorecase}}
+		CaseInsensitive: true,
+		{{/re_ignorecase}}
 	},
 	Rules{
 		{{#tokens}}
@@ -136,6 +145,21 @@ def translate_rules(rules):
     return out
 
 
+class TemplateView(object):
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    def re_not_multiline(self):
+        return not (self.regex_flags & re.MULTILINE)
+
+    def re_dotall(self):
+        return self.regex_flags & re.DOTALL
+
+    def re_ignorecase(self):
+        return self.regex_flags & re.IGNORECASE
+
+
 def main():
     package_name, symbol_name = sys.argv[1].rsplit(sep=".", maxsplit=1)
 
@@ -145,15 +169,15 @@ def main():
 
     assert issubclass(lexer_cls, pygments_lexer.RegexLexer), 'can only translate from RegexLexer'
 
-    print(pystache.render(TEMPLATE, {
-        'name': lexer_cls.name,
-        'options': lexer_cls.flags,
-        'upper_name': to_camel_case(lexer_cls.name),
-        'aliases': lexer_cls.aliases,
-        'filenames': lexer_cls.filenames,
-        'mimetypes': lexer_cls.mimetypes,
-        'tokens': [{'state': state, 'rules': translate_rules(rules)} for (state, rules) in lexer_cls.get_tokendefs().items()],
-    }))
+    print(pystache.render(TEMPLATE, TemplateView(
+        name=lexer_cls.name,
+        regex_flags=lexer_cls.flags,
+        upper_name=to_camel_case(lexer_cls.name),
+        aliases=lexer_cls.aliases,
+        filenames=lexer_cls.filenames,
+        mimetypes=lexer_cls.mimetypes,
+        tokens=[{'state': state, 'rules': translate_rules(rules)} for (state, rules) in lexer_cls.get_tokendefs().items()],
+    )))
 
 
 if __name__ == '__main__':
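The three re_* view methods above drive the new mustache sections: when a flag is set (or, for MULTILINE, unset), the matching section renders and emits the corresponding Config field in the generated Go source. A standalone Go sketch of that same flag-to-field mapping, using the bit values defined by CPython's re module (the constants and names here are illustration only, not part of the importer):

package main

import "fmt"

// Flag bit values as defined by CPython's re module.
const (
	reIGNORECASE = 2  // re.IGNORECASE
	reMULTILINE  = 8  // re.MULTILINE
	reDOTALL     = 16 // re.DOTALL
)

func main() {
	// Example: a Pygments lexer compiled with re.DOTALL | re.IGNORECASE.
	flags := reDOTALL | reIGNORECASE

	// Mirrors TemplateView.re_not_multiline / re_dotall / re_ignorecase.
	fmt.Println("NotMultiline:   ", flags&reMULTILINE == 0)
	fmt.Println("DotAll:         ", flags&reDOTALL != 0)
	fmt.Println("CaseInsensitive:", flags&reIGNORECASE != 0)
}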
@@ -52,15 +52,14 @@ func main() {
 	}
 }
 
-func getWriter(w io.Writer) func(chroma.Token) {
+func getWriter(w io.Writer) func(*chroma.Token) {
 	if *tokensFlag {
-		return func(token chroma.Token) {
+		return func(token *chroma.Token) {
 			fmt.Println(token)
 		}
-	} else {
-		formatter := formatters.Console(formatters.DefaultConsoleTheme)
-		writer, err := formatter.Format(w)
-		kingpin.FatalIfError(err, "")
-		return writer
 	}
+	formatter := formatters.Console(formatters.DefaultConsoleTheme)
+	writer, err := formatter.Format(w)
+	kingpin.FatalIfError(err, "")
+	return writer
 }
coalesce.go (16 lines changed)
@@ -9,22 +9,18 @@ type coalescer struct {
 	Lexer
 }
 
-func (d *coalescer) Tokenise(options *TokeniseOptions, text string, out func(Token)) error {
+func (d *coalescer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
 	var last *Token
-	defer func() {
-		if last != nil {
-			out(*last)
-		}
-	}()
-	return d.Lexer.Tokenise(options, text, func(token Token) {
+	defer func() { out(last) }()
+	return d.Lexer.Tokenise(options, text, func(token *Token) {
 		if last == nil {
-			last = &token
+			last = token
 		} else {
 			if last.Type == token.Type {
 				last.Value += token.Value
 			} else {
-				out(*last)
-				last = &token
+				out(last)
+				last = token
 			}
 		}
 	})
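The coalescer above merges runs of adjacent tokens that share a type before forwarding them, and with pointer callbacks it can hold the pending token directly rather than copying it. A minimal standalone sketch of that merging step (its own Token type and a slice-based API, not the chroma interfaces):

package main

import "fmt"

// Token is a local stand-in for illustration, not chroma's type.
type Token struct {
	Type  string
	Value string
}

// coalesce merges adjacent tokens of the same type into one token.
func coalesce(in []*Token) []*Token {
	var out []*Token
	var last *Token
	for _, t := range in {
		if last != nil && last.Type == t.Type {
			last.Value += t.Value // extend the pending token
			continue
		}
		last = &Token{Type: t.Type, Value: t.Value}
		out = append(out, last)
	}
	return out
}

func main() {
	in := []*Token{
		{"Punctuation", "!"}, {"Punctuation", "@"},
		{"Text", "abc"},
		{"Punctuation", "#"}, {"Punctuation", "$"},
	}
	for _, t := range coalesce(in) {
		fmt.Printf("%s %q\n", t.Type, t.Value)
	}
}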
@@ -12,10 +12,10 @@ func TestCoalesce(t *testing.T) {
 			Rule{`[[:punct:]]`, Punctuation, nil},
 		},
 	}))
-	actual, err := Tokenise(lexer, nil, "!@#$%")
+	actual, err := Tokenise(lexer, nil, "!@#$")
 	require.NoError(t, err)
-	expected := []Token{
-		Token{Punctuation, "!@#$%"},
+	expected := []*Token{
+		&Token{Punctuation, "!@#$"},
 	}
 	require.Equal(t, expected, actual)
 }
@@ -8,5 +8,5 @@ import (
 
 // Formatter returns a formatting function for tokens.
 type Formatter interface {
-	Format(w io.Writer) (func(chroma.Token), error)
+	Format(w io.Writer) (func(*chroma.Token), error)
 }
@@ -34,8 +34,8 @@ type consoleFormatter struct {
 	theme map[TokenType]string
 }
 
-func (c *consoleFormatter) Format(w io.Writer) (func(Token), error) {
-	return func(token Token) {
+func (c *consoleFormatter) Format(w io.Writer) (func(*Token), error) {
+	return func(token *Token) {
 		clr, ok := c.theme[token.Type]
 		if !ok {
 			clr, ok = c.theme[token.Type.SubCategory()]
@@ -46,8 +46,12 @@ func (c *consoleFormatter) Format(w io.Writer) (func(Token), error) {
 				}
 			}
 		}
-		fmt.Fprint(w, clr)
+		if clr != "" {
+			fmt.Fprint(w, clr)
+		}
 		fmt.Fprint(w, token.Value)
-		fmt.Fprintf(w, "\033[0m")
+		if clr != "" {
+			fmt.Fprintf(w, "\033[0m")
+		}
 	}, nil
 }
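The change above skips both the colour escape and the trailing reset when no colour is mapped for a token, so unstyled tokens pass through untouched. A standalone sketch of the same guard (hypothetical helper, not the chroma formatter):

package main

import (
	"fmt"
	"io"
	"os"
)

// writeStyled wraps value in an ANSI colour only when a colour is present;
// otherwise the value is written as-is, with no reset sequence either.
func writeStyled(w io.Writer, clr, value string) {
	if clr != "" {
		fmt.Fprint(w, clr)
	}
	fmt.Fprint(w, value)
	if clr != "" {
		fmt.Fprint(w, "\033[0m")
	}
}

func main() {
	writeStyled(os.Stdout, "\033[31m", "red text")
	writeStyled(os.Stdout, "", " plain text\n")
}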
lexer.go (32 lines changed)
@@ -57,13 +57,14 @@ type Config struct {
 	// TabSize int
 }
 
+// Token output to formatter.
 type Token struct {
 	Type  TokenType
 	Value string
 }
 
-func (t Token) String() string { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }
-func (t Token) GoString() string { return t.String() }
+func (t *Token) String() string { return t.Value }
+func (t *Token) GoString() string { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }
 
 type TokeniseOptions struct {
 	// State to start tokenisation in. Defaults to "root".
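With the pointer receivers above, printing a token now yields its raw value (so concatenating printed tokens reproduces the source text), while the %#v verb gives the debug form via GoString. A standalone sketch of that split (local Token type with a string Type field, not chroma's):

package main

import "fmt"

type Token struct {
	Type  string
	Value string
}

// String returns just the token text, so printing a stream of tokens
// reconstructs the original source.
func (t *Token) String() string { return t.Value }

// GoString returns a debug representation, used by the %#v verb.
func (t *Token) GoString() string { return fmt.Sprintf("Token{%s, %q}", t.Type, t.Value) }

func main() {
	t := &Token{Type: "Keyword", Value: "func"}
	fmt.Println(t)         // func
	fmt.Printf("%#v\n", t) // Token{Keyword, "func"}
}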
@@ -72,7 +73,7 @@ type TokeniseOptions struct {
 
 type Lexer interface {
 	Config() *Config
-	Tokenise(options *TokeniseOptions, text string, out func(Token)) error
+	Tokenise(options *TokeniseOptions, text string, out func(*Token)) error
 }
 
 // Analyser determines if this lexer is appropriate for the given text.
@@ -89,18 +90,18 @@ type Rule struct {
 // An Emitter takes group matches and returns tokens.
 type Emitter interface {
 	// Emit tokens for the given regex groups.
-	Emit(groups []string, lexer Lexer, out func(Token))
+	Emit(groups []string, lexer Lexer, out func(*Token))
 }
 
 // EmitterFunc is a function that is an Emitter.
-type EmitterFunc func(groups []string, lexer Lexer, out func(Token))
+type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))
 
 // Emit tokens for groups.
-func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(Token)) { e(groups, lexer, out) }
+func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }
 
 // ByGroups emits a token for each matching group in the rule's regex.
 func ByGroups(emitters ...Emitter) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(Token)) {
+	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
 		for i, group := range groups[1:] {
 			emitters[i].Emit([]string{group}, lexer, out)
 		}
@@ -110,7 +111,7 @@ func ByGroups(emitters ...Emitter) Emitter {
 
 // Using returns an Emitter that uses a given Lexer for parsing and emitting.
 func Using(lexer Lexer, options *TokeniseOptions) Emitter {
-	return EmitterFunc(func(groups []string, _ Lexer, out func(Token)) {
+	return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
 		if err := lexer.Tokenise(options, groups[0], out); err != nil {
 			panic(err)
 		}
@@ -119,7 +120,7 @@ func Using(lexer Lexer, options *TokeniseOptions) Emitter {
 
 // UsingSelf is like Using, but uses the current Lexer.
 func UsingSelf(state string) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(Token)) {
+	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
 		if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
 			panic(err)
 		}
@@ -151,6 +152,9 @@ func MustNewLexer(config *Config, rules Rules) Lexer {
 // "rules" is a state machine transitition map. Each key is a state. Values are sets of rules
 // that match input, optionally modify lexer state, and output tokens.
 func NewLexer(config *Config, rules Rules) (Lexer, error) {
+	if config == nil {
+		config = &Config{}
+	}
 	if _, ok := rules["root"]; !ok {
 		return nil, fmt.Errorf("no \"root\" state")
 	}
@@ -208,7 +212,7 @@ func (r *regexLexer) Config() *Config {
 	return r.config
 }
 
-func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(Token)) error {
+func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
 	if options == nil {
 		options = defaultOptions
 	}
@@ -223,7 +227,7 @@ func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(To
 		// fmt.Println(text[state.Pos:state.Pos+1], rule, state.Text[state.Pos:state.Pos+1])
 		// No match.
 		if index == nil {
-			out(Token{Error, state.Text[state.Pos : state.Pos+1]})
+			out(&Token{Error, state.Text[state.Pos : state.Pos+1]})
 			state.Pos++
 			continue
 		}
@@ -252,9 +256,9 @@ func (r *regexLexer) Tokenise(options *TokeniseOptions, text string, out func(To
 }
 
 // Tokenise text using lexer, returning tokens as a slice.
-func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
-	out := []Token{}
-	return out, lexer.Tokenise(options, text, func(token Token) { out = append(out, token) })
+func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
+	out := []*Token{}
+	return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
 }
 
 func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []int) {
@@ -35,7 +35,7 @@ func TestSimpleLexer(t *testing.T) {
 	a = 10
 `)
 	require.NoError(t, err)
-	expected := []Token{
+	expected := []*Token{
 		{Whitespace, "\n\t"},
 		{Comment, "; this is a comment"},
 		{Whitespace, "\n\t"},
@@ -60,12 +60,12 @@ var Markdown = Register(NewLexer(
 	},
 ))
 
-func handleCodeblock(groups []string, lexer Lexer, out func(Token)) {
-	out(Token{String, groups[1]})
-	out(Token{String, groups[2]})
-	out(Token{Text, groups[3]})
+func handleCodeblock(groups []string, lexer Lexer, out func(*Token)) {
+	out(&Token{String, groups[1]})
+	out(&Token{String, groups[2]})
+	out(&Token{Text, groups[3]})
 	code := groups[4]
 	lexer = Registry.Get(groups[2])
 	lexer.Tokenise(nil, code, out)
-	out(&Token{String, groups[5]})
 }
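handleCodeblock shows the delegation pattern the pointer signature has to support end to end: the fence and language tokens are emitted directly, while the body is handed to another lexer that writes into the same out callback. A standalone sketch of that shape (local types and a trivial inner tokeniser, not the Markdown lexer):

package main

import (
	"fmt"
	"strings"
)

type Token struct {
	Type  string
	Value string
}

// tokeniseWords stands in for the delegated-to lexer: it emits one
// token per word into the shared callback.
func tokeniseWords(text string, out func(*Token)) {
	for _, w := range strings.Fields(text) {
		out(&Token{"Word", w})
	}
}

// handleCodeblock emits the fences itself and delegates the body,
// mirroring the shape of the Markdown handler above.
func handleCodeblock(open, lang, body, closing string, out func(*Token)) {
	out(&Token{"String", open})
	out(&Token{"String", lang})
	tokeniseWords(body, out)
	out(&Token{"String", closing})
}

func main() {
	handleCodeblock("```", "go", "package main", "```", func(t *Token) {
		fmt.Printf("%s %q\n", t.Type, t.Value)
	})
}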
types.go (4 lines changed)
@@ -182,6 +182,6 @@ func (t TokenType) InSubCategory(other TokenType) bool {
 	return t/100 == other/100
 }
 
-func (t TokenType) Emit(groups []string, lexer Lexer, out func(Token)) {
-	out(Token{Type: t, Value: groups[0]})
+func (t TokenType) Emit(groups []string, lexer Lexer, out func(*Token)) {
+	out(&Token{Type: t, Value: groups[0]})
 }