1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-03-03 14:32:22 +02:00

Improve detection of upper-case characters by smart-case feature

Fixes #717 (partially)

The previous implementation of the smart-case feature was actually *too* smart,
in that it inspected the final character ranges in the AST to determine if the
pattern contained upper-case characters. This meant that patterns like `foo\w`
would not be handled case-insensitively, since `\w` includes the range of
upper-case characters A–Z.

As a medium-term solution to this problem, we now inspect the input pattern
itself for upper-case characters, ignoring any that immediately follow a `\`.
This neatly handles all of the most basic cases like `\w`, `\S`, and `É`, though
it still has problems with more complex features like `\p{Ll}`. Handling those
correctly will require improvements to the AST.
This commit is contained in:
dana 2017-12-18 14:56:01 -06:00 committed by Andrew Gallant
parent d775259ed9
commit 86c890bcec

View File

@ -102,9 +102,9 @@ impl GrepBuilder {
/// Whether to enable smart case search or not (disabled by default).
///
/// Smart case uses case insensitive search if the regex is contains all
/// lowercase literal characters. Otherwise, a case sensitive search is
/// used instead.
/// Smart case uses case insensitive search if the pattern contains only
/// lowercase characters (ignoring any characters which immediately follow
/// a '\'). Otherwise, a case sensitive search is used instead.
///
/// Enabling the case_insensitive flag overrides this.
pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
@ -197,8 +197,6 @@ impl GrepBuilder {
}
/// Determines whether the case insensitive flag should be enabled or not.
///
/// An error is returned if the regex could not be parsed.
fn is_case_insensitive(&self) -> Result<bool> {
if self.opts.case_insensitive {
return Ok(true);
@ -206,12 +204,7 @@ impl GrepBuilder {
if !self.opts.case_smart {
return Ok(false);
}
let expr =
try!(syntax::ExprBuilder::new()
.allow_bytes(true)
.unicode(true)
.parse(&self.pattern));
Ok(!has_uppercase_literal(&expr))
Ok(!has_uppercase_literal(&self.pattern))
}
}
@ -317,38 +310,26 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
}
}
fn has_uppercase_literal(expr: &Expr) -> bool {
use syntax::Expr::*;
fn byte_is_upper(b: u8) -> bool { b'A' <= b && b <= b'Z' }
match *expr {
Literal { ref chars, casei } => {
casei || chars.iter().any(|c| c.is_uppercase())
/// Determine whether the pattern contains an uppercase character which should
/// negate the effect of the smart-case option.
///
/// Ideally we would be able to check the AST in order to correctly handle
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
/// cased), but we don't currently have that option. For now, our 'good enough'
/// solution is to simply perform a semi-naïve scan of the input pattern and
/// ignore all characters following a '\'. The ExprBuilder will handle any
/// actual errors, and this at least lets us support the most common cases,
/// like 'foo\w' and 'foo\S', in an intuitive manner.
fn has_uppercase_literal(pattern: &str) -> bool {
let mut chars = pattern.chars();
while let Some(c) = chars.next() {
if c == '\\' {
chars.next();
} else if c.is_uppercase() {
return true;
}
LiteralBytes { ref bytes, casei } => {
casei || bytes.iter().any(|&b| byte_is_upper(b))
}
Class(ref ranges) => {
for r in ranges {
if r.start.is_uppercase() || r.end.is_uppercase() {
return true;
}
}
false
}
ClassBytes(ref ranges) => {
for r in ranges {
if byte_is_upper(r.start) || byte_is_upper(r.end) {
return true;
}
}
false
}
Group { ref e, .. } => has_uppercase_literal(e),
Repeat { ref e, .. } => has_uppercase_literal(e),
Concat(ref es) => es.iter().any(has_uppercase_literal),
Alternate(ref es) => es.iter().any(has_uppercase_literal),
_ => false,
}
false
}
#[cfg(test)]
@ -358,7 +339,7 @@ mod tests {
use memchr::{memchr, memrchr};
use regex::bytes::Regex;
use super::{GrepBuilder, Match};
use super::{GrepBuilder, Match, has_uppercase_literal};
static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
@ -395,4 +376,20 @@ mod tests {
assert_eq!(expected.len(), got.len());
assert_eq!(expected, got);
}
#[test]
fn pattern_case() {
assert_eq!(has_uppercase_literal(&"".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
}
}