mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2024-12-02 02:56:32 +02:00
Fix required literal handling and add debug prints.
In particular, if we had an inner literal and were doing a case-insensitive search, then the literals were dropped, because we previously only allowed a single inner literal to have an effect. Now we allow alternations of inner literals, but still don't quite take full advantage of them.
This commit is contained in:
parent
0891b4a3c0
commit
fd3e5069b6
@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
|
||||
license = "Unlicense/MIT"
|
||||
|
||||
[dependencies]
|
||||
log = "0.3"
|
||||
memchr = "0.1"
|
||||
memmap = "0.2"
|
||||
regex = "0.1.75"
|
||||
|
@ -4,6 +4,8 @@
|
||||
A fast line oriented regex searcher.
|
||||
*/
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
extern crate memchr;
|
||||
extern crate regex;
|
||||
extern crate regex_syntax as syntax;
|
||||
|
@ -1,13 +1,22 @@
|
||||
/*!
|
||||
The literals module is responsible for extracting *inner* literals out of the
|
||||
AST of a regular expression. Normally this is the job of the regex engine
|
||||
itself, but the regex engine doesn't look for inner literals. Since we're doing
|
||||
line based searching, we can use them, so we need to do it ourselves.
|
||||
|
||||
Note that this implementation is incredibly suspicious. We need something more
|
||||
principled.
|
||||
*/
|
||||
use std::cmp;
|
||||
use std::iter;
|
||||
|
||||
use regex::bytes::Regex;
|
||||
use syntax::{
|
||||
Expr, Literals, Lit,
|
||||
Repeater,
|
||||
ByteClass, ByteRange, CharClass, ClassRange, Repeater,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LiteralSets {
|
||||
prefixes: Literals,
|
||||
suffixes: Literals,
|
||||
@ -27,6 +36,7 @@ impl LiteralSets {
|
||||
|
||||
pub fn to_regex(&self) -> Option<Regex> {
|
||||
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
|
||||
debug!("literal prefixes detected: {:?}", self.prefixes);
|
||||
// When this is true, the regex engine will do a literal scan.
|
||||
return None;
|
||||
}
|
||||
@ -56,13 +66,27 @@ impl LiteralSets {
|
||||
if suf_lcs.len() > lit.len() {
|
||||
lit = suf_lcs;
|
||||
}
|
||||
if req.len() > lit.len() {
|
||||
if req_lits.len() == 1 && req.len() > lit.len() {
|
||||
lit = req;
|
||||
}
|
||||
if lit.is_empty() {
|
||||
|
||||
// Special case: if we detected an alternation of inner required
|
||||
// literals and its longest literal is bigger than the longest
|
||||
// prefix/suffix, then choose the alternation. In practice, this
|
||||
// helps with case insensitive matching, which can generate lots of
|
||||
// inner required literals.
|
||||
let any_empty = req_lits.iter().any(|lit| lit.is_empty());
|
||||
if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
|
||||
debug!("required literals found: {:?}", req_lits);
|
||||
let alts: Vec<String> =
|
||||
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
|
||||
// Literals always compile.
|
||||
Some(Regex::new(&alts.join("|")).unwrap())
|
||||
} else if lit.is_empty() {
|
||||
None
|
||||
} else {
|
||||
// Literals always compile.
|
||||
debug!("required literal found: {:?}", show(lit));
|
||||
Some(Regex::new(&bytes_to_regex(lit)).unwrap())
|
||||
}
|
||||
}
|
||||
@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
|
||||
let s: String = chars.iter().cloned().collect();
|
||||
lits.cross_add(s.as_bytes());
|
||||
}
|
||||
Literal { casei: true, .. } => {
|
||||
lits.cut();
|
||||
Literal { ref chars, casei: true } => {
|
||||
for &c in chars {
|
||||
let cls = CharClass::new(vec![
|
||||
ClassRange { start: c, end: c },
|
||||
]).case_fold();
|
||||
if !lits.add_char_class(&cls) {
|
||||
lits.cut();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
LiteralBytes { ref bytes, casei: false } => {
|
||||
lits.cross_add(bytes);
|
||||
}
|
||||
LiteralBytes { casei: true, .. } => {
|
||||
lits.cut();
|
||||
LiteralBytes { ref bytes, casei: true } => {
|
||||
for &b in bytes {
|
||||
let cls = ByteClass::new(vec![
|
||||
ByteRange { start: b, end: b },
|
||||
]).case_fold();
|
||||
if !lits.add_byte_class(&cls) {
|
||||
lits.cut();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
Class(_) => {
|
||||
lits.cut();
|
||||
@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String {
|
||||
}
|
||||
s
|
||||
}
|
||||
|
||||
/// Renders arbitrary bytes as a printable ASCII string for debug output.
///
/// Every byte is passed through `std::ascii::escape_default`: printable ASCII
/// is kept as is (except `\`, `'` and `"`, which are backslash-escaped), tab,
/// CR and LF become `\t`, `\r` and `\n`, and any other byte becomes `\xNN`.
fn show(bs: &[u8]) -> String {
    // Why aren't we using this to feed to the regex? Doesn't really matter
    // I guess. ---AG
    use std::ascii::escape_default;

    // `escape_default` only ever yields ASCII bytes, so each one converts
    // directly to a `char`; no per-byte buffer or UTF-8 validation is needed.
    bs.iter()
        .flat_map(|&b| escape_default(b))
        .map(|b| b as char)
        .collect()
}
|
||||
|
@ -152,6 +152,7 @@ impl GrepBuilder {
|
||||
.unicode(true)
|
||||
.case_insensitive(self.opts.case_insensitive)
|
||||
.parse(&self.pattern));
|
||||
debug!("regex ast:\n{:#?}", expr);
|
||||
Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user