grep: add "perfect" smart case detection

This commit removes the previous smart case detection logic and replaces it with detection based on the regex AST. This particular AST is a faithful representation of the concrete syntax, which lets us be very precise in how we handle it. Closes #851
2025-08-04 21:52:54 +02:00 · 2018-03-13 21:43:23 -04:00
parent cd08707c7c
commit 42b8132d0a
4 changed files with 203 additions and 49 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -43,6 +43,8 @@ Bug fixes:
  Support backslash escapes in globs.
 * [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832):
  Clarify usage instructions for `-f/--file` flag.
+* [BUG #851](https://github.com/BurntSushi/ripgrep/issues/851):
+  Fix `-S/--smart-case` detection once and for all.
 * [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852):
  Be robust with respect to `ENOMEM` errors returned by `mmap`.
 * [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853):
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match};
 mod literals;
 mod nonl;
 mod search;
+mod smart_case;
 mod word_boundary;

 /// Result is a convenient type alias that fixes the type of the error to
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@ -1,10 +1,11 @@
 use memchr::{memchr, memrchr};
+use syntax::ParserBuilder;
+use syntax::hir::Hir;
 use regex::bytes::{Regex, RegexBuilder};

 use literals::LiteralSets;
 use nonl;
-use syntax::ParserBuilder;
-use syntax::hir::Hir;
+use smart_case::Cased;
 use word_boundary::strip_unicode_word_boundaries;
 use Result;

@ -205,7 +206,11 @@ impl GrepBuilder {
        if !self.opts.case_smart {
            return Ok(false);
        }
-        Ok(!has_uppercase_literal(&self.pattern))
+        let cased = match Cased::from_pattern(&self.pattern) {
+            None => return Ok(false),
+            Some(cased) => cased,
+        };
+        Ok(cased.any_literal && !cased.any_uppercase)
    }
 }

@ -311,44 +316,15 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
    }
 }

-/// Determine whether the pattern contains an uppercase character which should
-/// negate the effect of the smart-case option.
-///
-/// Ideally we would be able to check the AST in order to correctly handle
-/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
-/// cased), but we don't currently have that option. For now, our 'good enough'
-/// solution is to simply perform a semi-naïve scan of the input pattern and
-/// ignore all characters following a '\'. The ExprBuilder will handle any
-/// actual errors, and this at least lets us support the most common cases,
-/// like 'foo\w' and 'foo\S', in an intuitive manner.
-fn has_uppercase_literal(pattern: &str) -> bool {
-    let mut chars = pattern.chars();
-    while let Some(c) = chars.next() {
-        if c == '\\' {
-            chars.next();
-        } else if c.is_uppercase() {
-            return true;
-        }
-    }
-    false
-}
-
 #[cfg(test)]
 mod tests {
-    #![allow(unused_imports)]
-
    use memchr::{memchr, memrchr};
    use regex::bytes::Regex;

-    use super::{GrepBuilder, Match, has_uppercase_literal};
+    use super::{GrepBuilder, Match};

    static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");

-    #[allow(dead_code)]
-    fn s(bytes: &[u8]) -> String {
-        String::from_utf8(bytes.to_vec()).unwrap()
-    }
-
    fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
        let re = Regex::new(pat).unwrap();
        let mut lines = vec![];
@ -377,20 +353,4 @@ mod tests {
        assert_eq!(expected.len(), got.len());
        assert_eq!(expected, got);
    }
-
-    #[test]
-    fn pattern_case() {
-        assert_eq!(has_uppercase_literal(&"".to_string()), false);
-        assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
-        assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
-        assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
-        assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
-        assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
-        assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
-        assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
-        assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
-        assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
-        assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
-        assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
-    }
 }
--- a/grep/src/smart_case.rs
+++ b/grep/src/smart_case.rs
@ -0,0 +1,191 @@
+use syntax::ast::{self, Ast};
+use syntax::ast::parse::Parser;
+
+/// The results of analyzing a regex for cased literals.
+#[derive(Clone, Debug, Default)]
+pub struct Cased {
+    /// True if and only if a literal uppercase character occurs in the regex.
+    ///
+    /// A regex like `\pL` contains no uppercase literals, even though `L`
+    /// is uppercase and the `\pL` class contains uppercase characters.
+    pub any_uppercase: bool,
+    /// True if and only if the regex contains any literal at all. A regex like
+    /// `\pL` has this set to false.
+    pub any_literal: bool,
+}
+
+impl Cased {
+    /// Returns a `Cased` value by doing analysis on the AST of `pattern`.
+    ///
+    /// If `pattern` is not a valid regular expression, then `None` is
+    /// returned.
+    pub fn from_pattern(pattern: &str) -> Option<Cased> {
+        Parser::new()
+            .parse(pattern)
+            .map(|ast| Cased::from_ast(&ast))
+            .ok()
+    }
+
+    fn from_ast(ast: &Ast) -> Cased {
+        let mut cased = Cased::default();
+        cased.from_ast_impl(ast);
+        cased
+    }
+
+    fn from_ast_impl(&mut self, ast: &Ast) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::Class(ast::Class::Unicode(_))
+            | Ast::Class(ast::Class::Perl(_)) => {}
+            Ast::Literal(ref x) => {
+                self.from_ast_literal(x);
+            }
+            Ast::Class(ast::Class::Bracketed(ref x)) => {
+                self.from_ast_class_set(&x.kind);
+            }
+            Ast::Repetition(ref x) => {
+                self.from_ast_impl(&x.ast);
+            }
+            Ast::Group(ref x) => {
+                self.from_ast_impl(&x.ast);
+            }
+            Ast::Alternation(ref alt) => {
+                for x in &alt.asts {
+                    self.from_ast_impl(x);
+                }
+            }
+            Ast::Concat(ref alt) => {
+                for x in &alt.asts {
+                    self.from_ast_impl(x);
+                }
+            }
+        }
+    }
+
+    fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            ast::ClassSet::Item(ref item) => {
+                self.from_ast_class_set_item(item);
+            }
+            ast::ClassSet::BinaryOp(ref x) => {
+                self.from_ast_class_set(&x.lhs);
+                self.from_ast_class_set(&x.rhs);
+            }
+        }
+    }
+
+    fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            ast::ClassSetItem::Empty(_)
+            | ast::ClassSetItem::Ascii(_)
+            | ast::ClassSetItem::Unicode(_)
+            | ast::ClassSetItem::Perl(_) => {}
+            ast::ClassSetItem::Literal(ref x) => {
+                self.from_ast_literal(x);
+            }
+            ast::ClassSetItem::Range(ref x) => {
+                self.from_ast_literal(&x.start);
+                self.from_ast_literal(&x.end);
+            }
+            ast::ClassSetItem::Bracketed(ref x) => {
+                self.from_ast_class_set(&x.kind);
+            }
+            ast::ClassSetItem::Union(ref union) => {
+                for x in &union.items {
+                    self.from_ast_class_set_item(x);
+                }
+            }
+        }
+    }
+
+    fn from_ast_literal(&mut self, ast: &ast::Literal) {
+        self.any_literal = true;
+        self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
+    }
+
+    /// Returns true if and only if the attributes can never change no matter
+    /// what other AST it might see.
+    fn done(&self) -> bool {
+        self.any_uppercase && self.any_literal
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn cased(pattern: &str) -> Cased {
+        Cased::from_pattern(pattern).unwrap()
+    }
+
+    #[test]
+    fn various() {
+        let x = cased("");
+        assert!(!x.any_uppercase);
+        assert!(!x.any_literal);
+
+        let x = cased("foo");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased("Foo");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased("foO");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo\\");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo\w");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo\S");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo\p{Ll}");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo[a-z]");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo[A-Z]");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo[\S\t]");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"foo\\S");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+
+        let x = cased(r"\p{Ll}");
+        assert!(!x.any_uppercase);
+        assert!(!x.any_literal);
+
+        let x = cased(r"aBc\w");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+    }
+}