1
0
mirror of https://github.com/alecthomas/chroma.git synced 2025-12-24 00:01:39 +02:00
Files
chroma/parser_test.go
Alec Thomas ee1172e04f feat: a custom grammar for lexers
While I do prefer XML over YAML, it's also super verbose.

```xml
<lexer>
  <config>
    <name>INI</name>
    <alias>ini</alias>
    <alias>cfg</alias>
    <alias>dosini</alias>
    <filename>*.ini</filename>
    <filename>*.cfg</filename>
    <filename>*.inf</filename>
    <filename>*.service</filename>
    <filename>*.socket</filename>
    <filename>.gitconfig</filename>
    <filename>.editorconfig</filename>
    <filename>pylintrc</filename>
    <filename>.pylintrc</filename>
    <mime_type>text/x-ini</mime_type>
    <mime_type>text/inf</mime_type>
    <priority>0.1</priority> <!-- higher priority than Inform 6 -->
  </config>
  <rules>
    <state name="root">
      <rule pattern="\s+">
        <token type="Text"/>
      </rule>
      <rule pattern="[;#].*">
        <token type="CommentSingle"/>
      </rule>
      <rule pattern="\[.*?\]$">
        <token type="Keyword"/>
      </rule>
      <rule pattern="(.*?)([ \t]*)(=)([ \t]*)(.*(?:\n[ \t].+)*)">
        <bygroups>
          <token type="NameAttribute"/>
          <token type="Text"/>
          <token type="Operator"/>
          <token type="Text"/>
          <token type="LiteralString"/>
        </bygroups>
      </rule>
      <rule pattern="(.+?)$">
        <token type="NameAttribute"/>
      </rule>
    </state>
  </rules>
</lexer>
```

Pros:
- Very succinct
- No need to escape regexes

Cons:
- Bespoke syntax that people will have to learn
- No syntax highlighting in editors, no validation beyond parser

```
config {
  name "INI"
  aliases "ini", "cfg"
  filenames "*.ini", "*.cfg", "*.inf", "*.service", "*.socket", ".gitconfig",
            ".editorconfig", "pylintrc", ".pylintrc"
  mime-types "text/x-ini", "text/inf"
  priority 0.1
}

state root {
  /\s+/ text
  /[;#].*/ commentsingle
  /\[.*?\]$/ keyword
  /(.*?)([ \t]*)(=)([ \t]*)(.*(?:\n[ \t].+)*)/ by groups
    nameattribute, text, operator, text, literalstring
  /(.+?)$/ nameattribute
}
```

Pros:
- More succinct than XML
- Can define a schema and have editors use it to validate.

Cons:
- Fucking YAML
  - Indentation is awful
- Less succinct than bespoke syntax
- Will need some way to discriminate between "emitters" and "mutators" when parsing, e.g. `type: Keyword` vs. `type: {bygroups: [...]}`

```yaml
config:
  name: "INI"
  aliases: ["ini", "cfg"]
  filenames: ["*.ini", "*.cfg", "*.inf", "*.service", "*.socket", ".gitconfig",
              ".editorconfig", "pylintrc", ".pylintrc"]
  mime-types: ["text/x-ini", "text/inf"]
  priority: 0.1
state:
  root:
    rule:
      - pattern: "\\s+"
        type: Text
      - pattern: "[;#].*"
        type: CommentSingle
      - pattern: "\\[.*?\\]$"
        type: Keyword
      - pattern: "(.*?)([ \\t]*)(=)([ \\t]*)(.*(?:\\n[ \\t].+)*)"
        type:
          bygroups: [NameAttribute, Text, Operator, Text, LiteralString]
      - pattern: "(.+?)$"
        type: NameAttribute
```
2025-03-23 10:28:53 +11:00

73 lines
1.5 KiB
Go

package chroma
import (
"testing"
assert "github.com/alecthomas/assert/v2"
)
// TestParser feeds an INI lexer definition written in the custom grammar to
// the parser and checks that the resulting AST carries the expected config
// block and a single "root" state with its five rules.
func TestParser(t *testing.T) {
	// Lexer definition under test: one config block and one state.
	const source = `
config {
name "INI"
aliases "ini", "cfg"
filenames "*.ini", "*.cfg", "*.inf", "*.service", "*.socket", ".gitconfig",
".editorconfig", "pylintrc", ".pylintrc"
mime-types "text/x-ini", "text/inf"
priority 0.1
}
state root {
/\s+/ text
/[;#].*/ commentsingle
/\[.*?\]$/ keyword
/(.*?)([ \t]*)(=)([ \t]*)(.*(?:\n[ \t].+)*)/ by groups
nameattribute, text, operator, text, literalstring
/(.+?)$/ nameattribute
}
`

	got, err := parser.ParseString("", source)
	assert.NoError(t, err)

	// Expected contents of the config block.
	wantConfig := &Config{
		Name:    "INI",
		Aliases: []string{"ini", "cfg"},
		Filenames: []string{
			"*.ini", "*.cfg", "*.inf", "*.service", "*.socket",
			".gitconfig", ".editorconfig", "pylintrc", ".pylintrc",
		},
		MimeTypes: []string{"text/x-ini", "text/inf"},
		Priority:  0.1,
	}

	// Expected rules of the "root" state, in source order.
	wantRootRules := []Rule{
		{Pattern: `\s+`, Type: &tokenTypeAST{Text}},
		{Pattern: `[;#].*`, Type: &tokenTypeAST{CommentSingle}},
		{Pattern: `\[.*?\]$`, Type: &tokenTypeAST{Keyword}},
		{
			Pattern: `(.*?)([ \t]*)(=)([ \t]*)(.*(?:\n[ \t].+)*)`,
			// One emitter per capture group of the pattern above.
			Type: &byGroupsEmitter{Emitters{
				&tokenTypeAST{NameAttribute},
				&tokenTypeAST{Text},
				&tokenTypeAST{Operator},
				&tokenTypeAST{Text},
				&tokenTypeAST{LiteralString},
			}},
		},
		{Pattern: `(.+?)$`, Type: &tokenTypeAST{NameAttribute}},
	}

	want := &AST{
		Config: wantConfig,
		States: []stateAST{
			{Name: "root", Rules: wantRootRules},
		},
	}
	assert.Equal(t, want, got)
}