1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2024-12-02 02:56:32 +02:00

Fix required literal handling and add debug prints.

In particular, if we had an inner literal and were doing a case insensitive
search, then the literals are dropped because we previously only allowed
a single inner literal to have an effect. Now we allow alternations of
inner literals, but still don't quite take full advantage.
This commit is contained in:
Andrew Gallant 2016-09-06 19:33:03 -04:00
parent 0891b4a3c0
commit fd3e5069b6
4 changed files with 67 additions and 8 deletions

View File

@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
log = "0.3"
memchr = "0.1"
memmap = "0.2"
regex = "0.1.75"

View File

@ -4,6 +4,8 @@
A fast line oriented regex searcher.
*/
#[macro_use]
extern crate log;
extern crate memchr;
extern crate regex;
extern crate regex_syntax as syntax;

View File

@ -1,13 +1,22 @@
/*!
The literals module is responsible for extracting *inner* literals out of the
AST of a regular expression. Normally this is the job of the regex engine
itself, but the regex engine doesn't look for inner literals. Since we're doing
line based searching, we can use them, so we need to do it ourselves.
Note that this implementation is incredibly suspicious. We need something more
principled.
*/
use std::cmp;
use std::iter;
use regex::bytes::Regex;
use syntax::{
Expr, Literals, Lit,
Repeater,
ByteClass, ByteRange, CharClass, ClassRange, Repeater,
};
#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct LiteralSets {
prefixes: Literals,
suffixes: Literals,
@ -27,6 +36,7 @@ impl LiteralSets {
pub fn to_regex(&self) -> Option<Regex> {
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan.
return None;
}
@ -56,13 +66,27 @@ impl LiteralSets {
if suf_lcs.len() > lit.len() {
lit = suf_lcs;
}
if req.len() > lit.len() {
if req_lits.len() == 1 && req.len() > lit.len() {
lit = req;
}
if lit.is_empty() {
// Special case: if we detected an alternation of inner required
// literals and its longest literal is bigger than the longest
// prefix/suffix, then choose the alternation. In practice, this
// helps with case insensitive matching, which can generate lots of
// inner required literals.
let any_empty = req_lits.iter().any(|lit| lit.is_empty());
if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
debug!("required literals found: {:?}", req_lits);
let alts: Vec<String> =
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
// Literals always compile.
Some(Regex::new(&alts.join("|")).unwrap())
} else if lit.is_empty() {
None
} else {
// Literals always compile.
debug!("required literal found: {:?}", show(lit));
Some(Regex::new(&bytes_to_regex(lit)).unwrap())
}
}
@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
let s: String = chars.iter().cloned().collect();
lits.cross_add(s.as_bytes());
}
Literal { casei: true, .. } => {
lits.cut();
Literal { ref chars, casei: true } => {
for &c in chars {
let cls = CharClass::new(vec![
ClassRange { start: c, end: c },
]).case_fold();
if !lits.add_char_class(&cls) {
lits.cut();
return;
}
}
}
LiteralBytes { ref bytes, casei: false } => {
lits.cross_add(bytes);
}
LiteralBytes { casei: true, .. } => {
lits.cut();
LiteralBytes { ref bytes, casei: true } => {
for &b in bytes {
let cls = ByteClass::new(vec![
ByteRange { start: b, end: b },
]).case_fold();
if !lits.add_byte_class(&cls) {
lits.cut();
return;
}
}
}
Class(_) => {
lits.cut();
@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String {
}
s
}
/// Converts arbitrary bytes to a nice string.
fn show(bs: &[u8]) -> String {
// Why aren't we using this to feed to the regex? Doesn't really matter
// I guess. ---AG
use std::ascii::escape_default;
use std::str;
let mut nice = String::new();
for &b in bs {
let part: Vec<u8> = escape_default(b).collect();
nice.push_str(str::from_utf8(&part).unwrap());
}
nice
}

View File

@ -152,6 +152,7 @@ impl GrepBuilder {
.unicode(true)
.case_insensitive(self.opts.case_insensitive)
.parse(&self.pattern));
debug!("regex ast:\n{:#?}", expr);
Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
}
}