1
0
mirror of https://github.com/alecthomas/chroma.git synced 2025-03-19 21:10:15 +02:00

feat: improve regex analysers in XML (#831)

This commit is contained in:
Carlos Henrique Guardão Gandarez 2023-08-22 22:51:13 -03:00 committed by GitHub
parent d37de78b18
commit 708662a581
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 67 additions and 29 deletions

View File

@ -60,14 +60,22 @@ type Config struct {
// Analyse is a list of regexes to match against the input. // Analyse is a list of regexes to match against the input.
// //
// The sum of all the score of matching patterns will be // If a match is found and the single attribute is set to true, the score of that match is returned;
// otherwise the sum of the scores of all matching patterns will be
// used as the final score. // used as the final score.
Analyse []AnalyseConfig `xml:"analyse,omitempty"` Analyse *AnalyseConfig `xml:"analyse,omitempty"`
} }
// AnalyseConfig defines a single regex analyser pattern. // AnalyseConfig defines the list of regex analysers.
type AnalyseConfig struct { type AnalyseConfig struct {
Regex string `xml:"regex,attr"` Regexes []RegexConfig `xml:"regex,omitempty"`
// If true, the score of the first matching regex is returned, ignoring any other matches.
Single bool `xml:"single,attr"`
}
// RegexConfig defines a single regex pattern and its score in case of match.
type RegexConfig struct {
Pattern string `xml:"pattern,attr"`
Score float32 `xml:"score,attr"` Score float32 `xml:"score,attr"`
} }

View File

@ -19,8 +19,10 @@
<mime_type>text/x-c++hdr</mime_type> <mime_type>text/x-c++hdr</mime_type>
<mime_type>text/x-c++src</mime_type> <mime_type>text/x-c++src</mime_type>
<ensure_nl>true</ensure_nl> <ensure_nl>true</ensure_nl>
<analyse regex="#include &lt;[a-z_]+>" score="0.2" /> <analyse single="true">
<analyse regex="using namespace " score="0.4" /> <regex pattern="#include &lt;[a-z_]+>" score="0.2" />
<regex pattern="using namespace " score="0.4" />
</analyse>
</config> </config>
<rules> <rules>
<state name="classname"> <state name="classname">

View File

@ -11,8 +11,10 @@
<mime_type>image/x-xbitmap</mime_type> <mime_type>image/x-xbitmap</mime_type>
<mime_type>image/x-xpixmap</mime_type> <mime_type>image/x-xpixmap</mime_type>
<ensure_nl>true</ensure_nl> <ensure_nl>true</ensure_nl>
<analyse regex="(?m)^\s*#include &lt;" score="0.1"/> <analyse single="true" >
<analyse regex="(?m)^\s*#ifn?def " score="0.1" /> <regex pattern="(?m)^\s*#include &lt;" score="0.1" />
<regex pattern="(?m)^\s*#ifn?def " score="0.1" />
</analyse>
</config> </config>
<rules> <rules>
<state name="statement"> <state name="statement">

View File

@ -298,11 +298,12 @@ func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
return r return r
} }
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint // AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
func (r *RegexLexer) AnalyseText(text string) float32 {
if r.analyser != nil { if r.analyser != nil {
return r.analyser(text) return r.analyser(text)
} }
return 0.0 return 0
} }
// SetConfig replaces the Config for this Lexer. // SetConfig replaces the Config for this Lexer.
@ -311,7 +312,8 @@ func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
return r return r
} }
func (r *RegexLexer) Config() *Config { // nolint // Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
return r.config return r.config
} }
@ -406,7 +408,8 @@ func (r *RegexLexer) needRules() error {
return err return err
} }
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint // Tokenise text using lexer, returning an iterator.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
err := r.needRules() err := r.needRules()
if err != nil { if err != nil {
return nil, err return nil, err

View File

@ -131,35 +131,58 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
for _, glob := range append(config.Filenames, config.AliasFilenames...) { for _, glob := range append(config.Filenames, config.AliasFilenames...) {
_, err := filepath.Match(glob, "") _, err := filepath.Match(glob, "")
if err != nil { if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err) return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
} }
} }
var analyserFn func(string) float32
if config.Analyse != nil {
type regexAnalyse struct { type regexAnalyse struct {
re *regexp2.Regexp re *regexp2.Regexp
score float32 score float32
} }
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
for _, ra := range config.Analyse { regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse.Regexes))
re, err := regexp2.Compile(ra.Regex, regexp2.None)
for _, ra := range config.Analyse.Regexes {
re, err := regexp2.Compile(ra.Pattern, regexp2.None)
if err != nil { if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err) return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Pattern, err)
} }
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score}) regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
} }
return &RegexLexer{
config: config, analyserFn = func(text string) float32 {
analyser: func(text string) float32 {
var score float32 var score float32
for _, ra := range regexAnalysers { for _, ra := range regexAnalysers {
if ok, _ := ra.re.MatchString(text); ok { ok, err := ra.re.MatchString(text)
if err != nil {
return 0
}
if ok && config.Analyse.Single {
return ra.score
}
if ok {
score += ra.score score += ra.score
} }
} }
return score return score
}, }
}
return &RegexLexer{
config: config,
analyser: analyserFn,
fetchRulesFunc: func() (Rules, error) { fetchRulesFunc: func() (Rules, error) {
var lexer struct { var lexer struct {
Config Config