mirror of
https://github.com/alecthomas/chroma.git
synced 2025-03-19 21:10:15 +02:00
feat: improve regex analysers in XML (#831)
This commit is contained in:
parent
d37de78b18
commit
708662a581
18
lexer.go
18
lexer.go
@ -60,15 +60,23 @@ type Config struct {
|
|||||||
|
|
||||||
// Analyse is a list of regexes to match against the input.
|
// Analyse is a list of regexes to match against the input.
|
||||||
//
|
//
|
||||||
// The sum of all the score of matching patterns will be
|
// If a match is found and the single attribute is set to true, the score of that match is returned,
|
||||||
|
// otherwise the sum of the scores of all matching patterns will be
|
||||||
// used as the final score.
|
// used as the final score.
|
||||||
Analyse []AnalyseConfig `xml:"analyse,omitempty"`
|
Analyse *AnalyseConfig `xml:"analyse,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// AnalyseConfig defines a single regex analyser pattern.
|
// AnalyseConfig defines the list of regex analysers.
|
||||||
type AnalyseConfig struct {
|
type AnalyseConfig struct {
|
||||||
Regex string `xml:"regex,attr"`
|
Regexes []RegexConfig `xml:"regex,omitempty"`
|
||||||
Score float32 `xml:"score,attr"`
|
// If true, the score of the first matching regex is returned and the remaining regexes are not evaluated.
|
||||||
|
Single bool `xml:"single,attr"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// RegexConfig defines a single regex pattern and its score in case of match.
|
||||||
|
type RegexConfig struct {
|
||||||
|
Pattern string `xml:"pattern,attr"`
|
||||||
|
Score float32 `xml:"score,attr"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Token output to formatter.
|
// Token output to formatter.
|
||||||
|
@ -19,8 +19,10 @@
|
|||||||
<mime_type>text/x-c++hdr</mime_type>
|
<mime_type>text/x-c++hdr</mime_type>
|
||||||
<mime_type>text/x-c++src</mime_type>
|
<mime_type>text/x-c++src</mime_type>
|
||||||
<ensure_nl>true</ensure_nl>
|
<ensure_nl>true</ensure_nl>
|
||||||
<analyse regex="#include <[a-z_]+>" score="0.2" />
|
<analyse single="true">
|
||||||
<analyse regex="using namespace " score="0.4" />
|
<regex pattern="#include <[a-z_]+>" score="0.2" />
|
||||||
|
<regex pattern="using namespace " score="0.4" />
|
||||||
|
</analyse>
|
||||||
</config>
|
</config>
|
||||||
<rules>
|
<rules>
|
||||||
<state name="classname">
|
<state name="classname">
|
||||||
|
@ -11,8 +11,10 @@
|
|||||||
<mime_type>image/x-xbitmap</mime_type>
|
<mime_type>image/x-xbitmap</mime_type>
|
||||||
<mime_type>image/x-xpixmap</mime_type>
|
<mime_type>image/x-xpixmap</mime_type>
|
||||||
<ensure_nl>true</ensure_nl>
|
<ensure_nl>true</ensure_nl>
|
||||||
<analyse regex="(?m)^\s*#include <" score="0.1"/>
|
<analyse single="true" >
|
||||||
<analyse regex="(?m)^\s*#ifn?def " score="0.1" />
|
<regex pattern="(?m)^\s*#include <" score="0.1" />
|
||||||
|
<regex pattern="(?m)^\s*#ifn?def " score="0.1" />
|
||||||
|
</analyse>
|
||||||
</config>
|
</config>
|
||||||
<rules>
|
<rules>
|
||||||
<state name="statement">
|
<state name="statement">
|
||||||
|
11
regexp.go
11
regexp.go
@ -298,11 +298,12 @@ func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
|
|||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
|
// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
|
||||||
|
func (r *RegexLexer) AnalyseText(text string) float32 {
|
||||||
if r.analyser != nil {
|
if r.analyser != nil {
|
||||||
return r.analyser(text)
|
return r.analyser(text)
|
||||||
}
|
}
|
||||||
return 0.0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetConfig replaces the Config for this Lexer.
|
// SetConfig replaces the Config for this Lexer.
|
||||||
@ -311,7 +312,8 @@ func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
|
|||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *RegexLexer) Config() *Config { // nolint
|
// Config returns the Config for this Lexer.
|
||||||
|
func (r *RegexLexer) Config() *Config {
|
||||||
return r.config
|
return r.config
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -406,7 +408,8 @@ func (r *RegexLexer) needRules() error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
|
// Tokenise text using lexer, returning an iterator.
|
||||||
|
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
|
||||||
err := r.needRules()
|
err := r.needRules()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
55
serialise.go
55
serialise.go
@ -131,35 +131,58 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, glob := range append(config.Filenames, config.AliasFilenames...) {
|
for _, glob := range append(config.Filenames, config.AliasFilenames...) {
|
||||||
_, err := filepath.Match(glob, "")
|
_, err := filepath.Match(glob, "")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
|
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
type regexAnalyse struct {
|
|
||||||
re *regexp2.Regexp
|
var analyserFn func(string) float32
|
||||||
score float32
|
|
||||||
}
|
if config.Analyse != nil {
|
||||||
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
|
type regexAnalyse struct {
|
||||||
for _, ra := range config.Analyse {
|
re *regexp2.Regexp
|
||||||
re, err := regexp2.Compile(ra.Regex, regexp2.None)
|
score float32
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)
|
|
||||||
}
|
}
|
||||||
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
|
|
||||||
}
|
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse.Regexes))
|
||||||
return &RegexLexer{
|
|
||||||
config: config,
|
for _, ra := range config.Analyse.Regexes {
|
||||||
analyser: func(text string) float32 {
|
re, err := regexp2.Compile(ra.Pattern, regexp2.None)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Pattern, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
|
||||||
|
}
|
||||||
|
|
||||||
|
analyserFn = func(text string) float32 {
|
||||||
var score float32
|
var score float32
|
||||||
|
|
||||||
for _, ra := range regexAnalysers {
|
for _, ra := range regexAnalysers {
|
||||||
if ok, _ := ra.re.MatchString(text); ok {
|
ok, err := ra.re.MatchString(text)
|
||||||
|
if err != nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if ok && config.Analyse.Single {
|
||||||
|
return ra.score
|
||||||
|
}
|
||||||
|
|
||||||
|
if ok {
|
||||||
score += ra.score
|
score += ra.score
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return score
|
return score
|
||||||
},
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &RegexLexer{
|
||||||
|
config: config,
|
||||||
|
analyser: analyserFn,
|
||||||
fetchRulesFunc: func() (Rules, error) {
|
fetchRulesFunc: func() (Rules, error) {
|
||||||
var lexer struct {
|
var lexer struct {
|
||||||
Config
|
Config
|
||||||
|
Loading…
x
Reference in New Issue
Block a user