1
0
mirror of https://github.com/alecthomas/chroma.git synced 2025-01-26 03:20:10 +02:00

feat: support basic regex analysers in XML (#828)

The `<analyse>` element contains a regex to match against the input, and
a score if the pattern matches.

The scores of all matching patterns for a lexer are summed.

Replaces #815, #813 and #826.
This commit is contained in:
Alec Thomas 2023-08-22 05:32:23 +10:00 committed by GitHub
parent 22266635c1
commit a20cd7e8df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 46 additions and 175 deletions

View File

@ -57,6 +57,18 @@ type Config struct {
//
// If this is 0 it will be treated as a default of 1.
Priority float32 `xml:"priority,omitempty"`
// Analyse is a list of regexes to match against the input.
//
// The sum of the scores of all matching patterns will be
// used as the final score.
Analyse []AnalyseConfig `xml:"analyse,omitempty"`
}
// AnalyseConfig defines a single regex analyser pattern.
//
// Both fields are decoded from XML attributes ("regex" and "score").
type AnalyseConfig struct {
// Regex is the pattern that is matched against the input text.
Regex string `xml:"regex,attr"`
// Score is the amount this pattern contributes to the total when Regex matches.
Score float32 `xml:"score,attr"`
}
// Token output to formatter.

View File

@ -1,37 +0,0 @@
package lexers
import (
"regexp"
. "github.com/alecthomas/chroma/v2" // nolint
)
// cAnalyserIncludeRe finds an #include directive at the start of a line
// (leading whitespace allowed) that is followed by `<` or `"`.
var cAnalyserIncludeRe = regexp.MustCompile(`(?m)^\s*#include [<"]`)

// cAnalyserIfdefRe finds an #ifdef or #ifndef directive at the start of a
// line (leading whitespace allowed).
var cAnalyserIfdefRe = regexp.MustCompile(`(?m)^\s*#ifn?def `)
// C is the lexer for C. Its rules are loaded from the embedded
// "embedded/c.xml" definition and the result is passed to Register.
var C = Register(MustNewXMLLexer(
	embedded,
	"embedded/c.xml",
).SetConfig(
	&Config{
		Name:    "C",
		Aliases: []string{"c"},
		Filenames: []string{
			"*.c",
			"*.h",
			"*.idc",
			"*.x[bp]m",
		},
		MimeTypes: []string{
			"text/x-chdr",
			"text/x-csrc",
			"image/x-xbitmap",
			"image/x-xpixmap",
		},
		EnsureNL: true,
		Priority: 0.1,
	},
).SetAnalyser(func(text string) float32 {
	// Either preprocessor hint yields the same score; the two are not summed.
	if cAnalyserIncludeRe.MatchString(text) || cAnalyserIfdefRe.MatchString(text) {
		return 0.1
	}
	return 0
}))

View File

@ -1,44 +0,0 @@
package lexers_test
import (
"io/ioutil"
"testing"
"github.com/alecthomas/chroma/v2"
"github.com/alecthomas/chroma/v2/lexers"
"github.com/alecthomas/assert/v2"
)
// TestC_AnalyseText reads each fixture file and checks the score that the C
// lexer's analyser reports for its contents.
func TestC_AnalyseText(t *testing.T) {
	type fixture struct {
		Filepath string
		Expected float32
	}
	cases := map[string]fixture{
		"include": {Filepath: "testdata/c_include.c", Expected: 0.1},
		"ifdef":   {Filepath: "testdata/c_ifdef.c", Expected: 0.1},
		"ifndef":  {Filepath: "testdata/c_ifndef.c", Expected: 0.1},
	}

	for name, tc := range cases {
		tc := tc // per-iteration copy for the subtest closure
		t.Run(name, func(t *testing.T) {
			contents, err := ioutil.ReadFile(tc.Filepath)
			assert.NoError(t, err)

			analyser, ok := lexers.C.(chroma.Analyser)
			assert.True(t, ok)

			got := analyser.AnalyseText(string(contents))
			assert.Equal(t, tc.Expected, got)
		})
	}
}

View File

@ -1,36 +0,0 @@
package lexers
import (
"regexp"
. "github.com/alecthomas/chroma/v2" // nolint
)
// cppAnalyserIncludeRe finds an angle-bracket #include whose header name
// consists only of lowercase letters and underscores (no extension).
var cppAnalyserIncludeRe = regexp.MustCompile(`#include <[a-z_]+>`)

// cppAnalyserNamespaceRe finds a "using namespace " directive.
var cppAnalyserNamespaceRe = regexp.MustCompile(`using namespace `)
// CPP is the lexer for C++. Its rules are loaded from the embedded
// "embedded/c++.xml" definition; the analyser is attached to the value
// returned by Register.
var CPP = Register(MustNewXMLLexer(
	embedded,
	"embedded/c++.xml",
).SetConfig(
	&Config{
		Name:    "C++",
		Aliases: []string{"cpp", "c++"},
		Filenames: []string{
			"*.cpp", "*.hpp", "*.c++", "*.h++", "*.cc", "*.hh", "*.cxx", "*.hxx",
			"*.C", "*.H", "*.cp", "*.CPP", "*.cppm", "*.ixx", "*.tpp",
		},
		MimeTypes: []string{"text/x-c++hdr", "text/x-c++src"},
		Priority:  0.1,
		EnsureNL:  true,
	},
)).SetAnalyser(func(text string) float32 {
	// The include check runs first, so text containing both hints scores 0.2.
	switch {
	case cppAnalyserIncludeRe.MatchString(text):
		return 0.2
	case cppAnalyserNamespaceRe.MatchString(text):
		return 0.4
	default:
		return 0
	}
})

View File

@ -1,57 +0,0 @@
package lexers_test
import (
"os"
"testing"
"github.com/alecthomas/chroma/v2"
"github.com/alecthomas/chroma/v2/lexers"
"github.com/alecthomas/assert/v2"
)
// TestCpp_AnalyseText reads each fixture file and checks the score that the
// C++ lexer's analyser reports for its contents.
func TestCpp_AnalyseText(t *testing.T) {
	type fixture struct {
		Filepath string
		Expected float32
	}
	cases := map[string]fixture{
		"include":   {Filepath: "testdata/cpp_include.cpp", Expected: 0.2},
		"namespace": {Filepath: "testdata/cpp_namespace.cpp", Expected: 0.4},
	}

	for name, tc := range cases {
		tc := tc // per-iteration copy for the subtest closure
		t.Run(name, func(t *testing.T) {
			contents, err := os.ReadFile(tc.Filepath)
			assert.NoError(t, err)

			analyser, ok := lexers.CPP.(chroma.Analyser)
			assert.True(t, ok)

			got := analyser.AnalyseText(string(contents))
			assert.Equal(t, tc.Expected, got)
		})
	}
}
// TestIssue290 tokenises a snippet containing very long digit sequences with
// the "C++" lexer, asserts that Tokenise returns no error, and then consumes
// every token up to EOF. Presumably a regression test for issue #290.
func TestIssue290(t *testing.T) {
	input := `// 64-bit floats have 53 digits of precision, including the whole-number-part.
double a = 0011111110111001100110011001100110011001100110011001100110011010; // imperfect representation of 0.1
double b = 0011111111001001100110011001100110011001100110011001100110011010; // imperfect representation of 0.2
double c = 0011111111010011001100110011001100110011001100110011001100110011; // imperfect representation of 0.3
double a + b = 0011111111010011001100110011001100110011001100110011001100110100; // Note that this is not quite equal to the "canonical" 0.3!a
`
	it, err := lexers.GlobalLexerRegistry.Get("C++").Tokenise(nil, input)
	assert.NoError(t, err)

	for token := it(); token != chroma.EOF; token = it() {
		// Nothing to check per token; the loop only drains the iterator.
	}
}

View File

@ -15,9 +15,12 @@
<filename>*.H</filename>
<filename>*.cp</filename>
<filename>*.CPP</filename>
<filename>*.tpp</filename>
<mime_type>text/x-c++hdr</mime_type>
<mime_type>text/x-c++src</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="#include &lt;[a-z_]+>" score="0.2" />
<analyse regex="using namespace " score="0.4" />
</config>
<rules>
<state name="classname">

View File

@ -11,6 +11,8 @@
<mime_type>image/x-xbitmap</mime_type>
<mime_type>image/x-xpixmap</mime_type>
<ensure_nl>true</ensure_nl>
<!-- Scores an #include at the start of a line, followed by either < or " (as in the previous Go analyser). -->
<analyse regex="(?m)^\s*#include [&lt;&quot;]" score="0.1"/>
<analyse regex="(?m)^\s*#ifn?def " score="0.1" />
</config>
<rules>
<state name="statement">

View File

@ -0,0 +1 @@
0.1

View File

@ -0,0 +1 @@
0.1

View File

@ -0,0 +1 @@
0.1

View File

@ -0,0 +1 @@
0.2

View File

@ -0,0 +1 @@
0.4

View File

@ -11,6 +11,8 @@ import (
"reflect"
"regexp"
"strings"
"github.com/dlclark/regexp2"
)
// Serialisation of Chroma rules to XML. The format is:
@ -107,7 +109,7 @@ func fastUnmarshalConfig(from fs.FS, path string) (*Config, error) {
var config Config
err = dec.DecodeElement(&config, &se)
if err != nil {
panic(err)
return nil, fmt.Errorf("%s: %w", path, err)
}
return &config, nil
}
@ -135,8 +137,29 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
for _, ra := range config.Analyse {
re, err := regexp2.Compile(ra.Regex, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)
}
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}
return &RegexLexer{
config: config,
analyser: func(text string) float32 {
var score float32
for _, ra := range regexAnalysers {
if ok, _ := ra.re.MatchString(text); ok {
score += ra.score
}
}
return score
},
fetchRulesFunc: func() (Rules, error) {
var lexer struct {
Config