1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-11-23 21:54:45 +02:00

globset: add allow_unclosed_class toggle

When enabled, patterns like `[abc`, `[]`, `[!]` are treated as if the
opening `[` is just a literal. This is in contrast the default behavior,
which prioritizes better error messages, of returning a parse error.

Fixes #3127, Closes #3145
This commit is contained in:
mostafa
2025-09-12 22:53:35 +03:00
committed by Andrew Gallant
parent 556623684e
commit f596a5d875
4 changed files with 193 additions and 17 deletions

View File

@@ -229,6 +229,11 @@ struct GlobOptions {
/// Whether or not an empty case in an alternate will be removed.
/// e.g., when enabled, `{,a}` will match "" and "a".
empty_alternates: bool,
/// Whether or not an unclosed character class is allowed. When an unclosed
/// character class is found, the opening `[` is treated as a literal `[`.
/// When this isn't enabled, an opening `[` without a corresponding `]` is
/// treated as an error.
allow_unclosed_class: bool,
}
impl GlobOptions {
@@ -238,6 +243,7 @@ impl GlobOptions {
literal_separator: false,
backslash_escape: !is_separator('\\'),
empty_alternates: false,
allow_unclosed_class: false,
}
}
}
@@ -594,6 +600,7 @@ impl<'a> GlobBuilder<'a> {
chars: self.glob.chars().peekable(),
prev: None,
cur: None,
found_unclosed_class: false,
opts: &self.opts,
};
p.parse()?;
@@ -657,6 +664,22 @@ impl<'a> GlobBuilder<'a> {
self.opts.empty_alternates = yes;
self
}
/// Toggle whether unclosed character classes are allowed. When allowed,
/// a `[` without a matching `]` is treated literally instead of resulting
/// in a parse error.
///
/// For example, if this is set then the glob `[abc` will be treated as the
/// literal string `[abc` instead of returning an error.
///
/// By default, this is false. Generally speaking, enabling this leads to
/// worse failure modes since the glob parser becomes more permissive. You
/// might want to enable this when compatibility (e.g., with POSIX glob
/// implementations) is more important than good error messages.
pub fn allow_unclosed_class(&mut self, yes: bool) -> &mut GlobBuilder<'a> {
self.opts.allow_unclosed_class = yes;
self
}
}
impl Tokens {
@@ -782,15 +805,29 @@ fn bytes_to_escaped_literal(bs: &[u8]) -> String {
}
struct Parser<'a> {
/// The glob to parse.
glob: &'a str,
// Marks the index in `stack` where the alternation started.
/// Marks the index in `stack` where the alternation started.
alternates_stack: Vec<usize>,
// The set of active alternation branches being parsed.
// Tokens are added to the end of the last one.
/// The set of active alternation branches being parsed.
/// Tokens are added to the end of the last one.
branches: Vec<Tokens>,
/// A character iterator over the glob pattern to parse.
chars: std::iter::Peekable<std::str::Chars<'a>>,
/// The previous character seen.
prev: Option<char>,
/// The current character.
cur: Option<char>,
/// Whether we failed to find a closing `]` for a character
/// class. This can only be true when `GlobOptions::allow_unclosed_class`
/// is enabled. When enabled, it is impossible to ever parse another
/// character class with this glob. That's because classes cannot be
/// nested *and* the only way this happens is when there is never a `]`.
///
/// We track this state so that we don't end up spending quadratic time
/// trying to parse something like `[[[[[[[[[[[[[[[[[[[[[[[...`.
found_unclosed_class: bool,
/// Glob options, which may influence parsing.
opts: &'a GlobOptions,
}
@@ -804,7 +841,7 @@ impl<'a> Parser<'a> {
match c {
'?' => self.push_token(Token::Any)?,
'*' => self.parse_star()?,
'[' => self.parse_class()?,
'[' if !self.found_unclosed_class => self.parse_class()?,
'{' => self.push_alternate()?,
'}' => self.pop_alternate()?,
',' => self.parse_comma()?,
@@ -939,6 +976,11 @@ impl<'a> Parser<'a> {
}
fn parse_class(&mut self) -> Result<(), Error> {
// Save parser state for potential rollback to literal '[' parsing.
let saved_chars = self.chars.clone();
let saved_prev = self.prev;
let saved_cur = self.cur;
fn add_to_last_range(
glob: &str,
r: &mut (char, char),
@@ -966,11 +1008,17 @@ impl<'a> Parser<'a> {
let mut first = true;
let mut in_range = false;
loop {
let c = match self.bump() {
Some(c) => c,
// The only way to successfully break this loop is to observe
// a ']'.
None => return Err(self.error(ErrorKind::UnclosedClass)),
let Some(c) = self.bump() else {
return if self.opts.allow_unclosed_class == true {
self.chars = saved_chars;
self.cur = saved_cur;
self.prev = saved_prev;
self.found_unclosed_class = true;
self.push_token(Token::Literal('['))
} else {
Err(self.error(ErrorKind::UnclosedClass))
};
};
match c {
']' => {
@@ -1055,6 +1103,7 @@ mod tests {
litsep: Option<bool>,
bsesc: Option<bool>,
ealtre: Option<bool>,
unccls: Option<bool>,
}
macro_rules! syntax {
@@ -1097,6 +1146,10 @@ mod tests {
if let Some(ealtre) = $options.ealtre {
builder.empty_alternates(ealtre);
}
if let Some(unccls) = $options.unccls {
builder.allow_unclosed_class(unccls);
}
let pat = builder.build().unwrap();
assert_eq!(format!("(?-u){}", $re), pat.regex());
}
@@ -1242,24 +1295,73 @@ mod tests {
syntaxerr!(err_alt3, "a,b}", ErrorKind::UnopenedAlternates);
syntaxerr!(err_alt4, "{a,b}}", ErrorKind::UnopenedAlternates);
const CASEI: Options =
Options { casei: Some(true), litsep: None, bsesc: None, ealtre: None };
const SLASHLIT: Options =
Options { casei: None, litsep: Some(true), bsesc: None, ealtre: None };
const CASEI: Options = Options {
casei: Some(true),
litsep: None,
bsesc: None,
ealtre: None,
unccls: None,
};
const SLASHLIT: Options = Options {
casei: None,
litsep: Some(true),
bsesc: None,
ealtre: None,
unccls: None,
};
const NOBSESC: Options = Options {
casei: None,
litsep: None,
bsesc: Some(false),
ealtre: None,
unccls: None,
};
const BSESC: Options = Options {
casei: None,
litsep: None,
bsesc: Some(true),
ealtre: None,
unccls: None,
};
const BSESC: Options =
Options { casei: None, litsep: None, bsesc: Some(true), ealtre: None };
const EALTRE: Options = Options {
casei: None,
litsep: None,
bsesc: Some(true),
ealtre: Some(true),
unccls: None,
};
const UNCCLS: Options = Options {
casei: None,
litsep: None,
bsesc: None,
ealtre: None,
unccls: Some(true),
};
toregex!(allow_unclosed_class_single, r"[", r"^\[$", &UNCCLS);
toregex!(allow_unclosed_class_many, r"[abc", r"^\[abc$", &UNCCLS);
toregex!(allow_unclosed_class_empty1, r"[]", r"^\[\]$", &UNCCLS);
toregex!(allow_unclosed_class_empty2, r"[][", r"^\[\]\[$", &UNCCLS);
toregex!(allow_unclosed_class_negated_unclosed, r"[!", r"^\[!$", &UNCCLS);
toregex!(allow_unclosed_class_negated_empty, r"[!]", r"^\[!\]$", &UNCCLS);
toregex!(
allow_unclosed_class_brace1,
r"{[abc,xyz}",
r"^(?:\[abc|xyz)$",
&UNCCLS
);
toregex!(
allow_unclosed_class_brace2,
r"{[abc,[xyz}",
r"^(?:\[abc|\[xyz)$",
&UNCCLS
);
toregex!(
allow_unclosed_class_brace3,
r"{[abc],[xyz}",
r"^(?:[abc]|\[xyz)$",
&UNCCLS
);
toregex!(re_empty, "", "^$");

View File

@@ -308,6 +308,7 @@ pub struct GitignoreBuilder {
root: PathBuf,
globs: Vec<Glob>,
case_insensitive: bool,
allow_unclosed_class: bool,
}
impl GitignoreBuilder {
@@ -324,6 +325,7 @@ impl GitignoreBuilder {
root: strip_prefix("./", root).unwrap_or(root).to_path_buf(),
globs: vec![],
case_insensitive: false,
allow_unclosed_class: true,
}
}
@@ -511,6 +513,7 @@ impl GitignoreBuilder {
.literal_separator(true)
.case_insensitive(self.case_insensitive)
.backslash_escape(true)
.allow_unclosed_class(self.allow_unclosed_class)
.build()
.map_err(|err| Error::Glob {
glob: Some(glob.original.clone()),
@@ -536,6 +539,26 @@ impl GitignoreBuilder {
self.case_insensitive = yes;
Ok(self)
}
/// Toggle whether unclosed character classes are allowed. When allowed,
/// a `[` without a matching `]` is treated literally instead of resulting
/// in a parse error.
///
/// For example, if this is set then the glob `[abc` will be treated as the
/// literal string `[abc` instead of returning an error.
///
/// By default, this is true in order to match established `gitignore`
/// semantics. Generally speaking, enabling this leads to worse failure
/// modes since the glob parser becomes more permissive. You might want to
/// enable this when compatibility (e.g., with POSIX glob implementations)
/// is more important than good error messages.
pub fn allow_unclosed_class(
&mut self,
yes: bool,
) -> &mut GitignoreBuilder {
self.allow_unclosed_class = yes;
self
}
}
/// Return the file path of the current environment's global gitignore file.

View File

@@ -1,5 +1,6 @@
/*!
The overrides module provides a way to specify a set of override globs.
This provides functionality similar to `--include` or `--exclude` in command
line tools.
*/
@@ -120,7 +121,9 @@ impl OverrideBuilder {
///
/// Matching is done relative to the directory path provided.
pub fn new<P: AsRef<Path>>(path: P) -> OverrideBuilder {
OverrideBuilder { builder: GitignoreBuilder::new(path) }
let mut builder = GitignoreBuilder::new(path);
builder.allow_unclosed_class(false);
OverrideBuilder { builder }
}
/// Builds a new override matcher from the globs added so far.
@@ -143,7 +146,8 @@ impl OverrideBuilder {
/// Toggle whether the globs should be matched case insensitively or not.
///
/// When this option is changed, only globs added after the change will be affected.
/// When this option is changed, only globs added after the change will be
/// affected.
///
/// This is disabled by default.
pub fn case_insensitive(
@@ -155,6 +159,28 @@ impl OverrideBuilder {
self.builder.case_insensitive(yes)?;
Ok(self)
}
/// Toggle whether unclosed character classes are allowed. When allowed,
/// a `[` without a matching `]` is treated literally instead of resulting
/// in a parse error.
///
/// For example, if this is set then the glob `[abc` will be treated as the
/// literal string `[abc` instead of returning an error.
///
/// By default, this is false. Generally speaking, enabling this leads to
/// worse failure modes since the glob parser becomes more permissive. You
/// might want to enable this when compatibility (e.g., with POSIX glob
/// implementations) is more important than good error messages.
///
/// This default is different from the default for [`Gitignore`]. Namely,
/// [`Gitignore`] is intended to match git's behavior as-is. But this
/// abstraction for "override" globs does not necessarily conform to any
/// other known specification and instead prioritizes better error
/// messages.
pub fn allow_unclosed_class(&mut self, yes: bool) -> &mut OverrideBuilder {
self.builder.allow_unclosed_class(yes);
self
}
}
#[cfg(test)]