1
0
mirror of https://github.com/alecthomas/chroma.git synced 2025-03-17 20:58:08 +02:00

feat: improve regex analysers in XML (#831)

This commit is contained in:
Carlos Henrique Guardão Gandarez 2023-08-22 22:51:13 -03:00 committed by GitHub
parent d37de78b18
commit 708662a581
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 67 additions and 29 deletions

View File

@ -60,15 +60,23 @@ type Config struct {
// Analyse is a list of regexes to match against the input.
//
// The sum of all the score of matching patterns will be
// If the single attribute is set to true, the score of the first matching
// pattern is returned; otherwise the sum of the scores of all matching patterns is
// used as the final score.
Analyse []AnalyseConfig `xml:"analyse,omitempty"`
Analyse *AnalyseConfig `xml:"analyse,omitempty"`
}
// AnalyseConfig defines a single regex analyser pattern.
// AnalyseConfig defines the list of regex analysers.
type AnalyseConfig struct {
Regex string `xml:"regex,attr"`
Score float32 `xml:"score,attr"`
Regexes []RegexConfig `xml:"regex,omitempty"`
// If true, the score is returned despite other matches.
Single bool `xml:"single,attr"`
}
// RegexConfig defines a single regex pattern and its score in case of match.
//
// It is decoded from a <regex> element nested inside <analyse>; both fields
// are read from XML attributes of that element.
type RegexConfig struct {
// Pattern is the regular expression source. NewXMLLexer compiles it with
// regexp2 and returns an error if it is not a valid regex.
Pattern string `xml:"pattern,attr"`
// Score is the value contributed when Pattern matches the analysed text:
// it is returned directly when the enclosing analyse element has
// single="true", otherwise it is added to the running total.
Score float32 `xml:"score,attr"`
}
// Token output to formatter.

View File

@ -19,8 +19,10 @@
<mime_type>text/x-c++hdr</mime_type>
<mime_type>text/x-c++src</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="#include &lt;[a-z_]+>" score="0.2" />
<analyse regex="using namespace " score="0.4" />
<analyse single="true">
<regex pattern="#include &lt;[a-z_]+>" score="0.2" />
<regex pattern="using namespace " score="0.4" />
</analyse>
</config>
<rules>
<state name="classname">

View File

@ -11,8 +11,10 @@
<mime_type>image/x-xbitmap</mime_type>
<mime_type>image/x-xpixmap</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="(?m)^\s*#include &lt;" score="0.1"/>
<analyse regex="(?m)^\s*#ifn?def " score="0.1" />
<analyse single="true" >
<regex pattern="(?m)^\s*#include &lt;" score="0.1" />
<regex pattern="(?m)^\s*#ifn?def " score="0.1" />
</analyse>
</config>
<rules>
<state name="statement">

View File

@ -298,11 +298,12 @@ func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
return r
}
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
func (r *RegexLexer) AnalyseText(text string) float32 {
if r.analyser != nil {
return r.analyser(text)
}
return 0.0
return 0
}
// SetConfig replaces the Config for this Lexer.
@ -311,7 +312,8 @@ func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
return r
}
func (r *RegexLexer) Config() *Config { // nolint
// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
return r.config
}
@ -406,7 +408,8 @@ func (r *RegexLexer) needRules() error {
return err
}
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
// Tokenise text using lexer, returning an iterator.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
err := r.needRules()
if err != nil {
return nil, err

View File

@ -131,35 +131,58 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
if err != nil {
return nil, err
}
for _, glob := range append(config.Filenames, config.AliasFilenames...) {
_, err := filepath.Match(glob, "")
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
for _, ra := range config.Analyse {
re, err := regexp2.Compile(ra.Regex, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)
var analyserFn func(string) float32
if config.Analyse != nil {
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}
return &RegexLexer{
config: config,
analyser: func(text string) float32 {
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse.Regexes))
for _, ra := range config.Analyse.Regexes {
re, err := regexp2.Compile(ra.Pattern, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Pattern, err)
}
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}
analyserFn = func(text string) float32 {
var score float32
for _, ra := range regexAnalysers {
if ok, _ := ra.re.MatchString(text); ok {
ok, err := ra.re.MatchString(text)
if err != nil {
return 0
}
if ok && config.Analyse.Single {
return ra.score
}
if ok {
score += ra.score
}
}
return score
},
}
}
return &RegexLexer{
config: config,
analyser: analyserFn,
fetchRulesFunc: func() (Rules, error) {
var lexer struct {
Config