mirror of
https://github.com/alecthomas/chroma.git
synced 2025-04-21 12:06:53 +02:00
feat: support basic regex analysers in XML (#828)
The `<analyse>` element contains a regex to match against the input, and a score if the pattern matches. The scores of all matching patterns for a lexer are summed. Replaces #815, #813 and #826.
This commit is contained in:
parent
22266635c1
commit
a20cd7e8df
12
lexer.go
12
lexer.go
@ -57,6 +57,18 @@ type Config struct {
|
|||||||
//
|
//
|
||||||
// If this is 0 it will be treated as a default of 1.
|
// If this is 0 it will be treated as a default of 1.
|
||||||
Priority float32 `xml:"priority,omitempty"`
|
Priority float32 `xml:"priority,omitempty"`
|
||||||
|
|
||||||
|
// Analyse is a list of regexes to match against the input.
|
||||||
|
//
|
||||||
|
// The sum of all the score of matching patterns will be
|
||||||
|
// used as the final score.
|
||||||
|
Analyse []AnalyseConfig `xml:"analyse,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// AnalyseConfig defines a single regex analyser pattern.
// The pattern is matched against the input text; when it matches, Score
// is added to the lexer's total analysis score (see Config.Analyse —
// the scores of all matching patterns are summed).
type AnalyseConfig struct {
	// Regex is the pattern matched against the input.
	Regex string `xml:"regex,attr"`
	// Score contributed to the lexer's total when Regex matches.
	Score float32 `xml:"score,attr"`
}
|
}
|
||||||
|
|
||||||
// Token output to formatter.
|
// Token output to formatter.
|
||||||
|
37
lexers/c.go
37
lexers/c.go
@ -1,37 +0,0 @@
|
|||||||
package lexers

import (
	"regexp"

	. "github.com/alecthomas/chroma/v2" // nolint
)

// Heuristic patterns used by the analyser below to recognise C source:
// a #include directive (angle-bracket or quoted form) or an
// #ifdef/#ifndef guard at the start of a line.
var (
	cAnalyserIncludeRe = regexp.MustCompile(`(?m)^\s*#include [<"]`)
	cAnalyserIfdefRe   = regexp.MustCompile(`(?m)^\s*#ifn?def `)
)

// C lexer.
var C = Register(MustNewXMLLexer(
	embedded,
	"embedded/c.xml",
).SetConfig(
	&Config{
		Name:      "C",
		Aliases:   []string{"c"},
		Filenames: []string{"*.c", "*.h", "*.idc", "*.x[bp]m"},
		MimeTypes: []string{"text/x-chdr", "text/x-csrc", "image/x-xbitmap", "image/x-xpixmap"},
		EnsureNL:  true,
		Priority:  0.1,
	},
).SetAnalyser(func(text string) float32 {
	// Either preprocessor pattern alone is treated as weak evidence of C;
	// both yield the same modest score.
	switch {
	case cAnalyserIncludeRe.MatchString(text):
		return 0.1
	case cAnalyserIfdefRe.MatchString(text):
		return 0.1
	}
	return 0
}))
|
|
@ -1,44 +0,0 @@
|
|||||||
package lexers_test

import (
	"os"
	"testing"

	"github.com/alecthomas/chroma/v2"
	"github.com/alecthomas/chroma/v2/lexers"

	"github.com/alecthomas/assert/v2"
)

// TestC_AnalyseText checks that the C analyser assigns the expected
// score to representative testdata files (include, ifdef, ifndef).
func TestC_AnalyseText(t *testing.T) {
	tests := map[string]struct {
		Filepath string
		Expected float32
	}{
		"include": {
			Filepath: "testdata/c_include.c",
			Expected: 0.1,
		},
		"ifdef": {
			Filepath: "testdata/c_ifdef.c",
			Expected: 0.1,
		},
		"ifndef": {
			Filepath: "testdata/c_ifndef.c",
			Expected: 0.1,
		},
	}

	for name, test := range tests {
		test := test // capture range variable for the subtest closure
		t.Run(name, func(t *testing.T) {
			// os.ReadFile replaces the deprecated ioutil.ReadFile (Go 1.16+),
			// matching the sibling cpp analyser test.
			data, err := os.ReadFile(test.Filepath)
			assert.NoError(t, err)

			analyser, ok := lexers.C.(chroma.Analyser)
			assert.True(t, ok)

			assert.Equal(t, test.Expected, analyser.AnalyseText(string(data)))
		})
	}
}
|
|
@ -1,36 +0,0 @@
|
|||||||
package lexers

import (
	"regexp"

	. "github.com/alecthomas/chroma/v2" // nolint
)

// Heuristic patterns used by the analyser below to recognise C++ source:
// a system include of an extension-less (lowercase) header, and a
// "using namespace" directive.
var (
	cppAnalyserIncludeRe   = regexp.MustCompile(`#include <[a-z_]+>`)
	cppAnalyserNamespaceRe = regexp.MustCompile(`using namespace `)
)

// CPP lexer.
var CPP = Register(MustNewXMLLexer(
	embedded,
	"embedded/c++.xml",
).SetConfig(
	&Config{
		Name:      "C++",
		Aliases:   []string{"cpp", "c++"},
		Filenames: []string{"*.cpp", "*.hpp", "*.c++", "*.h++", "*.cc", "*.hh", "*.cxx", "*.hxx", "*.C", "*.H", "*.cp", "*.CPP", "*.cppm", "*.ixx", "*.tpp"},
		MimeTypes: []string{"text/x-c++hdr", "text/x-c++src"},
		Priority:  0.1,
		EnsureNL:  true,
	},
)).SetAnalyser(func(text string) float32 {
	// The include pattern is checked first and scores lower than the
	// namespace directive; only the first matching pattern's score is used.
	switch {
	case cppAnalyserIncludeRe.MatchString(text):
		return 0.2
	case cppAnalyserNamespaceRe.MatchString(text):
		return 0.4
	}
	return 0
})
|
|
@ -1,57 +0,0 @@
|
|||||||
package lexers_test
|
|
||||||
|
|
||||||
import (
|
|
||||||
"os"
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"github.com/alecthomas/chroma/v2"
|
|
||||||
"github.com/alecthomas/chroma/v2/lexers"
|
|
||||||
|
|
||||||
"github.com/alecthomas/assert/v2"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestCpp_AnalyseText(t *testing.T) {
|
|
||||||
tests := map[string]struct {
|
|
||||||
Filepath string
|
|
||||||
Expected float32
|
|
||||||
}{
|
|
||||||
"include": {
|
|
||||||
Filepath: "testdata/cpp_include.cpp",
|
|
||||||
Expected: 0.2,
|
|
||||||
},
|
|
||||||
"namespace": {
|
|
||||||
Filepath: "testdata/cpp_namespace.cpp",
|
|
||||||
Expected: 0.4,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for name, test := range tests {
|
|
||||||
test := test
|
|
||||||
t.Run(name, func(t *testing.T) {
|
|
||||||
data, err := os.ReadFile(test.Filepath)
|
|
||||||
assert.NoError(t, err)
|
|
||||||
|
|
||||||
analyser, ok := lexers.CPP.(chroma.Analyser)
|
|
||||||
assert.True(t, ok)
|
|
||||||
|
|
||||||
assert.Equal(t, test.Expected, analyser.AnalyseText(string(data)))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestIssue290(t *testing.T) {
|
|
||||||
input := `// 64-bit floats have 53 digits of precision, including the whole-number-part.
|
|
||||||
double a = 0011111110111001100110011001100110011001100110011001100110011010; // imperfect representation of 0.1
|
|
||||||
double b = 0011111111001001100110011001100110011001100110011001100110011010; // imperfect representation of 0.2
|
|
||||||
double c = 0011111111010011001100110011001100110011001100110011001100110011; // imperfect representation of 0.3
|
|
||||||
double a + b = 0011111111010011001100110011001100110011001100110011001100110100; // Note that this is not quite equal to the "canonical" 0.3!a
|
|
||||||
`
|
|
||||||
it, err := lexers.GlobalLexerRegistry.Get("C++").Tokenise(nil, input)
|
|
||||||
assert.NoError(t, err)
|
|
||||||
for {
|
|
||||||
token := it()
|
|
||||||
if token == chroma.EOF {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -15,9 +15,12 @@
|
|||||||
<filename>*.H</filename>
|
<filename>*.H</filename>
|
||||||
<filename>*.cp</filename>
|
<filename>*.cp</filename>
|
||||||
<filename>*.CPP</filename>
|
<filename>*.CPP</filename>
|
||||||
|
<filename>*.tpp</filename>
|
||||||
<mime_type>text/x-c++hdr</mime_type>
|
<mime_type>text/x-c++hdr</mime_type>
|
||||||
<mime_type>text/x-c++src</mime_type>
|
<mime_type>text/x-c++src</mime_type>
|
||||||
<ensure_nl>true</ensure_nl>
|
<ensure_nl>true</ensure_nl>
|
||||||
|
<analyse regex="#include <[a-z_]+>" score="0.2" />
|
||||||
|
<analyse regex="using namespace " score="0.4" />
|
||||||
</config>
|
</config>
|
||||||
<rules>
|
<rules>
|
||||||
<state name="classname">
|
<state name="classname">
|
||||||
|
@ -11,6 +11,8 @@
|
|||||||
<mime_type>image/x-xbitmap</mime_type>
|
<mime_type>image/x-xbitmap</mime_type>
|
||||||
<mime_type>image/x-xpixmap</mime_type>
|
<mime_type>image/x-xpixmap</mime_type>
|
||||||
<ensure_nl>true</ensure_nl>
|
<ensure_nl>true</ensure_nl>
|
||||||
|
<analyse regex="(?m)^\s*#include <" score="0.1"/>
|
||||||
|
<analyse regex="(?m)^\s*#ifn?def " score="0.1" />
|
||||||
</config>
|
</config>
|
||||||
<rules>
|
<rules>
|
||||||
<state name="statement">
|
<state name="statement">
|
||||||
|
1
lexers/testdata/analysis/c.ifdef.expected
vendored
Normal file
1
lexers/testdata/analysis/c.ifdef.expected
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
0.1
|
1
lexers/testdata/analysis/c.ifndef.expected
vendored
Normal file
1
lexers/testdata/analysis/c.ifndef.expected
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
0.1
|
1
lexers/testdata/analysis/c.include.expected
vendored
Normal file
1
lexers/testdata/analysis/c.include.expected
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
0.1
|
1
lexers/testdata/analysis/cpp.include.expected
vendored
Normal file
1
lexers/testdata/analysis/cpp.include.expected
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
0.2
|
1
lexers/testdata/analysis/cpp.namespace.expected
vendored
Normal file
1
lexers/testdata/analysis/cpp.namespace.expected
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
0.4
|
25
serialise.go
25
serialise.go
@ -11,6 +11,8 @@ import (
|
|||||||
"reflect"
|
"reflect"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/dlclark/regexp2"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Serialisation of Chroma rules to XML. The format is:
|
// Serialisation of Chroma rules to XML. The format is:
|
||||||
@ -107,7 +109,7 @@ func fastUnmarshalConfig(from fs.FS, path string) (*Config, error) {
|
|||||||
var config Config
|
var config Config
|
||||||
err = dec.DecodeElement(&config, &se)
|
err = dec.DecodeElement(&config, &se)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
return nil, fmt.Errorf("%s: %w", path, err)
|
||||||
}
|
}
|
||||||
return &config, nil
|
return &config, nil
|
||||||
}
|
}
|
||||||
@ -135,8 +137,29 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
|
|||||||
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
|
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
type regexAnalyse struct {
|
||||||
|
re *regexp2.Regexp
|
||||||
|
score float32
|
||||||
|
}
|
||||||
|
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
|
||||||
|
for _, ra := range config.Analyse {
|
||||||
|
re, err := regexp2.Compile(ra.Regex, regexp2.None)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)
|
||||||
|
}
|
||||||
|
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
|
||||||
|
}
|
||||||
return &RegexLexer{
|
return &RegexLexer{
|
||||||
config: config,
|
config: config,
|
||||||
|
analyser: func(text string) float32 {
|
||||||
|
var score float32
|
||||||
|
for _, ra := range regexAnalysers {
|
||||||
|
if ok, _ := ra.re.MatchString(text); ok {
|
||||||
|
score += ra.score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return score
|
||||||
|
},
|
||||||
fetchRulesFunc: func() (Rules, error) {
|
fetchRulesFunc: func() (Rules, error) {
|
||||||
var lexer struct {
|
var lexer struct {
|
||||||
Config
|
Config
|
||||||
|
Loading…
x
Reference in New Issue
Block a user