mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-03-03 14:32:22 +02:00
Improve detection of upper-case characters by smart-case feature
Fixes #717 (partially) The previous implementation of the smart-case feature was actually *too* smart, in that it inspected the final character ranges in the AST to determine if the pattern contained upper-case characters. This meant that patterns like `foo\w` would not be handled case-insensitively, since `\w` includes the range of upper-case characters A–Z. As a medium-term solution to this problem, we now inspect the input pattern itself for upper-case characters, ignoring any that immediately follow a `\`. This neatly handles all of the most basic cases like `\w`, `\S`, and `É`, though it still has problems with more complex features like `\p{Ll}`. Handling those correctly will require improvements to the AST.
This commit is contained in:
parent
d775259ed9
commit
86c890bcec
@ -102,9 +102,9 @@ impl GrepBuilder {
|
||||
|
||||
/// Whether to enable smart case search or not (disabled by default).
|
||||
///
|
||||
/// Smart case uses case insensitive search if the regex is contains all
|
||||
/// lowercase literal characters. Otherwise, a case sensitive search is
|
||||
/// used instead.
|
||||
/// Smart case uses case insensitive search if the pattern contains only
|
||||
/// lowercase characters (ignoring any characters which immediately follow
|
||||
/// a '\'). Otherwise, a case sensitive search is used instead.
|
||||
///
|
||||
/// Enabling the case_insensitive flag overrides this.
|
||||
pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
|
||||
@ -197,8 +197,6 @@ impl GrepBuilder {
|
||||
}
|
||||
|
||||
/// Determines whether the case insensitive flag should be enabled or not.
|
||||
///
|
||||
/// An error is returned if the regex could not be parsed.
|
||||
fn is_case_insensitive(&self) -> Result<bool> {
|
||||
if self.opts.case_insensitive {
|
||||
return Ok(true);
|
||||
@ -206,12 +204,7 @@ impl GrepBuilder {
|
||||
if !self.opts.case_smart {
|
||||
return Ok(false);
|
||||
}
|
||||
let expr =
|
||||
try!(syntax::ExprBuilder::new()
|
||||
.allow_bytes(true)
|
||||
.unicode(true)
|
||||
.parse(&self.pattern));
|
||||
Ok(!has_uppercase_literal(&expr))
|
||||
Ok(!has_uppercase_literal(&self.pattern))
|
||||
}
|
||||
}
|
||||
|
||||
@ -317,38 +310,26 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
|
||||
}
|
||||
}
|
||||
|
||||
fn has_uppercase_literal(expr: &Expr) -> bool {
|
||||
use syntax::Expr::*;
|
||||
fn byte_is_upper(b: u8) -> bool { b'A' <= b && b <= b'Z' }
|
||||
match *expr {
|
||||
Literal { ref chars, casei } => {
|
||||
casei || chars.iter().any(|c| c.is_uppercase())
|
||||
/// Determine whether the pattern contains an uppercase character which should
|
||||
/// negate the effect of the smart-case option.
|
||||
///
|
||||
/// Ideally we would be able to check the AST in order to correctly handle
|
||||
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
|
||||
/// cased), but we don't currently have that option. For now, our 'good enough'
|
||||
/// solution is to simply perform a semi-naïve scan of the input pattern and
|
||||
/// ignore all characters following a '\'. The ExprBuilder will handle any
|
||||
/// actual errors, and this at least lets us support the most common cases,
|
||||
/// like 'foo\w' and 'foo\S', in an intuitive manner.
|
||||
fn has_uppercase_literal(pattern: &str) -> bool {
|
||||
let mut chars = pattern.chars();
|
||||
while let Some(c) = chars.next() {
|
||||
if c == '\\' {
|
||||
chars.next();
|
||||
} else if c.is_uppercase() {
|
||||
return true;
|
||||
}
|
||||
LiteralBytes { ref bytes, casei } => {
|
||||
casei || bytes.iter().any(|&b| byte_is_upper(b))
|
||||
}
|
||||
Class(ref ranges) => {
|
||||
for r in ranges {
|
||||
if r.start.is_uppercase() || r.end.is_uppercase() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
ClassBytes(ref ranges) => {
|
||||
for r in ranges {
|
||||
if byte_is_upper(r.start) || byte_is_upper(r.end) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
Group { ref e, .. } => has_uppercase_literal(e),
|
||||
Repeat { ref e, .. } => has_uppercase_literal(e),
|
||||
Concat(ref es) => es.iter().any(has_uppercase_literal),
|
||||
Alternate(ref es) => es.iter().any(has_uppercase_literal),
|
||||
_ => false,
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -358,7 +339,7 @@ mod tests {
|
||||
use memchr::{memchr, memrchr};
|
||||
use regex::bytes::Regex;
|
||||
|
||||
use super::{GrepBuilder, Match};
|
||||
use super::{GrepBuilder, Match, has_uppercase_literal};
|
||||
|
||||
static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
|
||||
|
||||
@ -395,4 +376,20 @@ mod tests {
|
||||
assert_eq!(expected.len(), got.len());
|
||||
assert_eq!(expected, got);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pattern_case() {
|
||||
assert_eq!(has_uppercase_literal(&"".to_string()), false);
|
||||
assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
|
||||
assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
|
||||
assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
|
||||
assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
|
||||
assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
|
||||
assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
|
||||
assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
|
||||
assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
|
||||
assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
|
||||
assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
|
||||
assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user