diff --git a/crates/globset/src/glob.rs b/crates/globset/src/glob.rs index 14c2cba5..f745b1de 100644 --- a/crates/globset/src/glob.rs +++ b/crates/globset/src/glob.rs @@ -229,6 +229,11 @@ struct GlobOptions { /// Whether or not an empty case in an alternate will be removed. /// e.g., when enabled, `{,a}` will match "" and "a". empty_alternates: bool, + /// Whether or not an unclosed character class is allowed. When an unclosed + /// character class is found, the opening `[` is treated as a literal `[`. + /// When this isn't enabled, an opening `[` without a corresponding `]` is + /// treated as an error. + allow_unclosed_class: bool, } impl GlobOptions { @@ -238,6 +243,7 @@ impl GlobOptions { literal_separator: false, backslash_escape: !is_separator('\\'), empty_alternates: false, + allow_unclosed_class: false, } } } @@ -594,6 +600,7 @@ impl<'a> GlobBuilder<'a> { chars: self.glob.chars().peekable(), prev: None, cur: None, + found_unclosed_class: false, opts: &self.opts, }; p.parse()?; @@ -657,6 +664,22 @@ impl<'a> GlobBuilder<'a> { self.opts.empty_alternates = yes; self } + + /// Toggle whether unclosed character classes are allowed. When allowed, + /// a `[` without a matching `]` is treated literally instead of resulting + /// in a parse error. + /// + /// For example, if this is set then the glob `[abc` will be treated as the + /// literal string `[abc` instead of returning an error. + /// + /// By default, this is false. Generally speaking, enabling this leads to + /// worse failure modes since the glob parser becomes more permissive. You + /// might want to enable this when compatibility (e.g., with POSIX glob + /// implementations) is more important than good error messages. + pub fn allow_unclosed_class(&mut self, yes: bool) -> &mut GlobBuilder<'a> { + self.opts.allow_unclosed_class = yes; + self + } } impl Tokens { @@ -782,15 +805,29 @@ fn bytes_to_escaped_literal(bs: &[u8]) -> String { } struct Parser<'a> { + /// The glob to parse. glob: &'a str, - // Marks the index in `stack` where the alternation started. + /// Marks the index in `stack` where the alternation started. alternates_stack: Vec, - // The set of active alternation branches being parsed. - // Tokens are added to the end of the last one. + /// The set of active alternation branches being parsed. + /// Tokens are added to the end of the last one. branches: Vec, + /// A character iterator over the glob pattern to parse. chars: std::iter::Peekable>, + /// The previous character seen. prev: Option, + /// The current character. cur: Option, + /// Whether we failed to find a closing `]` for a character + /// class. This can only be true when `GlobOptions::allow_unclosed_class` + /// is enabled. When enabled, it is impossible to ever parse another + /// character class with this glob. That's because classes cannot be + /// nested *and* the only way this happens is when there is never a `]`. + /// + /// We track this state so that we don't end up spending quadratic time + /// trying to parse something like `[[[[[[[[[[[[[[[[[[[[[[[...`. + found_unclosed_class: bool, + /// Glob options, which may influence parsing. opts: &'a GlobOptions, } @@ -804,7 +841,7 @@ impl<'a> Parser<'a> { match c { '?' => self.push_token(Token::Any)?, '*' => self.parse_star()?, - '[' => self.parse_class()?, + '[' if !self.found_unclosed_class => self.parse_class()?, '{' => self.push_alternate()?, '}' => self.pop_alternate()?, ',' => self.parse_comma()?, @@ -939,6 +976,11 @@ impl<'a> Parser<'a> { } fn parse_class(&mut self) -> Result<(), Error> { + // Save parser state for potential rollback to literal '[' parsing. + let saved_chars = self.chars.clone(); + let saved_prev = self.prev; + let saved_cur = self.cur; + fn add_to_last_range( glob: &str, r: &mut (char, char), @@ -966,11 +1008,17 @@ impl<'a> Parser<'a> { let mut first = true; let mut in_range = false; loop { - let c = match self.bump() { - Some(c) => c, - // The only way to successfully break this loop is to observe - // a ']'. - None => return Err(self.error(ErrorKind::UnclosedClass)), + let Some(c) = self.bump() else { + return if self.opts.allow_unclosed_class == true { + self.chars = saved_chars; + self.cur = saved_cur; + self.prev = saved_prev; + self.found_unclosed_class = true; + + self.push_token(Token::Literal('[')) + } else { + Err(self.error(ErrorKind::UnclosedClass)) + }; }; match c { ']' => { @@ -1055,6 +1103,7 @@ mod tests { litsep: Option, bsesc: Option, ealtre: Option, + unccls: Option, } macro_rules! syntax { @@ -1097,6 +1146,10 @@ mod tests { if let Some(ealtre) = $options.ealtre { builder.empty_alternates(ealtre); } + if let Some(unccls) = $options.unccls { + builder.allow_unclosed_class(unccls); + } + let pat = builder.build().unwrap(); assert_eq!(format!("(?-u){}", $re), pat.regex()); } @@ -1242,24 +1295,73 @@ mod tests { syntaxerr!(err_alt3, "a,b}", ErrorKind::UnopenedAlternates); syntaxerr!(err_alt4, "{a,b}}", ErrorKind::UnopenedAlternates); - const CASEI: Options = - Options { casei: Some(true), litsep: None, bsesc: None, ealtre: None }; - const SLASHLIT: Options = - Options { casei: None, litsep: Some(true), bsesc: None, ealtre: None }; + const CASEI: Options = Options { + casei: Some(true), + litsep: None, + bsesc: None, + ealtre: None, + unccls: None, + }; + const SLASHLIT: Options = Options { + casei: None, + litsep: Some(true), + bsesc: None, + ealtre: None, + unccls: None, + }; const NOBSESC: Options = Options { casei: None, litsep: None, bsesc: Some(false), ealtre: None, + unccls: None, + }; + const BSESC: Options = Options { + casei: None, + litsep: None, + bsesc: Some(true), + ealtre: None, + unccls: None, }; - const BSESC: Options = - Options { casei: None, litsep: None, bsesc: Some(true), ealtre: None }; const EALTRE: Options = Options { casei: None, litsep: None, bsesc: Some(true), ealtre: Some(true), + unccls: None, }; + const UNCCLS: Options = Options { + casei: None, + litsep: None, + bsesc: None, + ealtre: None, + unccls: Some(true), + }; + + toregex!(allow_unclosed_class_single, r"[", r"^\[$", &UNCCLS); + toregex!(allow_unclosed_class_many, r"[abc", r"^\[abc$", &UNCCLS); + toregex!(allow_unclosed_class_empty1, r"[]", r"^\[\]$", &UNCCLS); + toregex!(allow_unclosed_class_empty2, r"[][", r"^\[\]\[$", &UNCCLS); + toregex!(allow_unclosed_class_negated_unclosed, r"[!", r"^\[!$", &UNCCLS); + toregex!(allow_unclosed_class_negated_empty, r"[!]", r"^\[!\]$", &UNCCLS); + toregex!( + allow_unclosed_class_brace1, + r"{[abc,xyz}", + r"^(?:\[abc|xyz)$", + &UNCCLS + ); + toregex!( + allow_unclosed_class_brace2, + r"{[abc,[xyz}", + r"^(?:\[abc|\[xyz)$", + &UNCCLS + ); + toregex!( + allow_unclosed_class_brace3, + r"{[abc],[xyz}", + r"^(?:[abc]|\[xyz)$", + &UNCCLS + ); toregex!(re_empty, "", "^$"); diff --git a/crates/ignore/src/gitignore.rs b/crates/ignore/src/gitignore.rs index eeb51b82..1b9a7e18 100644 --- a/crates/ignore/src/gitignore.rs +++ b/crates/ignore/src/gitignore.rs @@ -308,6 +308,7 @@ pub struct GitignoreBuilder { root: PathBuf, globs: Vec, case_insensitive: bool, + allow_unclosed_class: bool, } impl GitignoreBuilder { @@ -324,6 +325,7 @@ impl GitignoreBuilder { root: strip_prefix("./", root).unwrap_or(root).to_path_buf(), globs: vec![], case_insensitive: false, + allow_unclosed_class: true, } } @@ -511,6 +513,7 @@ impl GitignoreBuilder { .literal_separator(true) .case_insensitive(self.case_insensitive) .backslash_escape(true) + .allow_unclosed_class(self.allow_unclosed_class) .build() .map_err(|err| Error::Glob { glob: Some(glob.original.clone()), @@ -536,6 +539,26 @@ impl GitignoreBuilder { self.case_insensitive = yes; Ok(self) } + + /// Toggle whether unclosed character classes are allowed. When allowed, + /// a `[` without a matching `]` is treated literally instead of resulting + /// in a parse error. + /// + /// For example, if this is set then the glob `[abc` will be treated as the + /// literal string `[abc` instead of returning an error. + /// + /// By default, this is true in order to match established `gitignore` + /// semantics. Generally speaking, enabling this leads to worse failure + /// modes since the glob parser becomes more permissive. You might want to + /// enable this when compatibility (e.g., with POSIX glob implementations) + /// is more important than good error messages. + pub fn allow_unclosed_class( + &mut self, + yes: bool, + ) -> &mut GitignoreBuilder { + self.allow_unclosed_class = yes; + self + } } /// Return the file path of the current environment's global gitignore file. diff --git a/crates/ignore/src/overrides.rs b/crates/ignore/src/overrides.rs index 1b02b4d4..079b02c4 100644 --- a/crates/ignore/src/overrides.rs +++ b/crates/ignore/src/overrides.rs @@ -1,5 +1,6 @@ /*! The overrides module provides a way to specify a set of override globs. + This provides functionality similar to `--include` or `--exclude` in command line tools. */ @@ -120,7 +121,9 @@ impl OverrideBuilder { /// /// Matching is done relative to the directory path provided. pub fn new>(path: P) -> OverrideBuilder { - OverrideBuilder { builder: GitignoreBuilder::new(path) } + let mut builder = GitignoreBuilder::new(path); + builder.allow_unclosed_class(false); + OverrideBuilder { builder } } /// Builds a new override matcher from the globs added so far. @@ -143,7 +146,8 @@ impl OverrideBuilder { /// Toggle whether the globs should be matched case insensitively or not. /// - /// When this option is changed, only globs added after the change will be affected. + /// When this option is changed, only globs added after the change will be + /// affected. /// /// This is disabled by default. pub fn case_insensitive( @@ -155,6 +159,28 @@ impl OverrideBuilder { self.builder.case_insensitive(yes)?; Ok(self) } + + /// Toggle whether unclosed character classes are allowed. When allowed, + /// a `[` without a matching `]` is treated literally instead of resulting + /// in a parse error. + /// + /// For example, if this is set then the glob `[abc` will be treated as the + /// literal string `[abc` instead of returning an error. + /// + /// By default, this is false. Generally speaking, enabling this leads to + /// worse failure modes since the glob parser becomes more permissive. You + /// might want to enable this when compatibility (e.g., with POSIX glob + /// implementations) is more important than good error messages. + /// + /// This default is different from the default for [`Gitignore`]. Namely, + /// [`Gitignore`] is intended to match git's behavior as-is. But this + /// abstraction for "override" globs does not necessarily conform to any + /// other known specification and instead prioritizes better error + /// messages. + pub fn allow_unclosed_class(&mut self, yes: bool) -> &mut OverrideBuilder { + self.builder.allow_unclosed_class(yes); + self + } } #[cfg(test)] diff --git a/tests/regression.rs b/tests/regression.rs index ac806ae3..5772b014 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -1519,3 +1519,28 @@ rgtest!(r3108_files_without_match_quiet_exit, |dir: Dir, _: TestCommand| { .stdout(); eqnice!("", got); }); + +// See: https://github.com/BurntSushi/ripgrep/issues/3127 +rgtest!( + r3127_gitignore_allow_unclosed_class, + |dir: Dir, mut cmd: TestCommand| { + dir.create_dir(".git"); + dir.create(".gitignore", "[abc"); + dir.create("[abc", ""); + dir.create("test", ""); + + let got = cmd.args(&["--files"]).stdout(); + eqnice!("test\n", got); + } +); + +// See: https://github.com/BurntSushi/ripgrep/issues/3127 +rgtest!( + r3127_glob_flag_not_allow_unclosed_class, + |dir: Dir, mut cmd: TestCommand| { + dir.create("[abc", ""); + dir.create("test", ""); + + cmd.args(&["--files", "-g", "[abc"]).assert_err(); + } +);