Improve detection of upper-case characters by smart-case feature

Fixes #717 (partially) The previous implementation of the smart-case feature was actually *too* smart, in that it inspected the final character ranges in the AST to determine if the pattern contained upper-case characters. This meant that patterns like `foo\w` would not be handled case-insensitively, since `\w` includes the range of upper-case characters A–Z. As a medium-term solution to this problem, we now inspect the input pattern itself for upper-case characters, ignoring any that immediately follow a `\`. This neatly handles all of the most basic cases like `\w`, `\S`, and `É`, though it still has problems with more complex features like `\p{Ll}`. Handling those correctly will require improvements to the AST.
2025-11-23 21:54:45 +02:00 · 2017-12-18 14:56:01 -06:00
parent d775259ed9
commit 86c890bcec
1 changed files with 39 additions and 42 deletions
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -102,9 +102,9 @@ impl GrepBuilder {

    /// Whether to enable smart case search or not (disabled by default).
    ///
-    /// Smart case uses case insensitive search if the regex is contains all
-    /// lowercase literal characters. Otherwise, a case sensitive search is
-    /// used instead.
+    /// Smart case uses case insensitive search if the pattern contains only
+    /// lowercase characters (ignoring any characters which immediately follow
+    /// a '\'). Otherwise, a case sensitive search is used instead.
    ///
    /// Enabling the case_insensitive flag overrides this.
    pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
@@ -197,8 +197,6 @@ impl GrepBuilder {
    }

    /// Determines whether the case insensitive flag should be enabled or not.
-    ///
-    /// An error is returned if the regex could not be parsed.
    fn is_case_insensitive(&self) -> Result<bool> {
        if self.opts.case_insensitive {
            return Ok(true);
@@ -206,12 +204,7 @@ impl GrepBuilder {
        if !self.opts.case_smart {
            return Ok(false);
        }
-        let expr =
-            try!(syntax::ExprBuilder::new()
-                 .allow_bytes(true)
-                 .unicode(true)
-                 .parse(&self.pattern));
-        Ok(!has_uppercase_literal(&expr))
+        Ok(!has_uppercase_literal(&self.pattern))
    }
 }

@@ -317,38 +310,26 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
    }
 }

-fn has_uppercase_literal(expr: &Expr) -> bool {
-    use syntax::Expr::*;
-    fn byte_is_upper(b: u8) -> bool { b'A' <= b && b <= b'Z' }
-    match *expr {
-        Literal { ref chars, casei } => {
-            casei || chars.iter().any(|c| c.is_uppercase())
+/// Determine whether the pattern contains an uppercase character which should
+/// negate the effect of the smart-case option.
+///
+/// Ideally we would be able to check the AST in order to correctly handle
+/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
+/// cased), but we don't currently have that option. For now, our 'good enough'
+/// solution is to simply perform a semi-naïve scan of the input pattern and
+/// ignore all characters following a '\'. The ExprBuilder will handle any
+/// actual errors, and this at least lets us support the most common cases,
+/// like 'foo\w' and 'foo\S', in an intuitive manner.
+fn has_uppercase_literal(pattern: &str) -> bool {
+    let mut chars = pattern.chars();
+    while let Some(c) = chars.next() {
+        if c == '\\' {
+            chars.next();
+        } else if c.is_uppercase() {
+            return true;
        }
-        LiteralBytes { ref bytes, casei } => {
-            casei || bytes.iter().any(|&b| byte_is_upper(b))
-        }
-        Class(ref ranges) => {
-            for r in ranges {
-                if r.start.is_uppercase() || r.end.is_uppercase() {
-                    return true;
-                }
-            }
-            false
-        }
-        ClassBytes(ref ranges) => {
-            for r in ranges {
-                if byte_is_upper(r.start) || byte_is_upper(r.end) {
-                    return true;
-                }
-            }
-            false
-        }
-        Group { ref e, .. } => has_uppercase_literal(e),
-        Repeat { ref e, .. } => has_uppercase_literal(e),
-        Concat(ref es) => es.iter().any(has_uppercase_literal),
-        Alternate(ref es) => es.iter().any(has_uppercase_literal),
-        _ => false,
    }
+    false
 }

 #[cfg(test)]
@@ -358,7 +339,7 @@ mod tests {
    use memchr::{memchr, memrchr};
    use regex::bytes::Regex;

-    use super::{GrepBuilder, Match};
+    use super::{GrepBuilder, Match, has_uppercase_literal};

    static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");

@@ -395,4 +376,20 @@ mod tests {
        assert_eq!(expected.len(), got.len());
        assert_eq!(expected, got);
    }
+
+    #[test]
+    fn pattern_case() {
+        assert_eq!(has_uppercase_literal(&"".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
+    }
 }