1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-01-03 05:10:12 +02:00
This commit is contained in:
Andrew Gallant 2016-03-29 21:21:34 -04:00
parent 4ae67a8587
commit 79a51029c1
3 changed files with 210 additions and 66 deletions

View File

@ -1,12 +1,10 @@
use std::cmp;
use std::iter;
use std::str;
use regex::quote;
use regex::bytes::Regex;
use syntax::{
Expr, Literals, Lit,
ByteClass, CharClass, Repeater, ClassRange, ByteRange,
Repeater,
};
#[derive(Debug)]
@ -16,11 +14,6 @@ pub struct LiteralSets {
required: Literals,
}
/// A matcher for a single literal, backed by a compiled byte regex.
#[derive(Debug)]
pub struct LiteralMatcher {
/// The compiled regex that matches the literal.
re: Regex,
}
impl LiteralSets {
pub fn create(expr: &Expr) -> Self {
let mut required = Literals::empty();
@ -32,7 +25,11 @@ impl LiteralSets {
}
}
pub fn to_matcher(&self) -> Option<LiteralMatcher> {
pub fn to_matcher(&self) -> Option<Regex> {
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
// When this is true, the regex engine will do a literal scan.
return None;
}
let pre_lcp = self.prefixes.longest_common_prefix();
let pre_lcs = self.prefixes.longest_common_suffix();
let suf_lcp = self.suffixes.longest_common_prefix();
@ -60,8 +57,8 @@ impl LiteralSets {
if lit.is_empty() {
None
} else {
let s = str::from_utf8(lit).unwrap();
Some(LiteralMatcher { re: Regex::new(&quote(s)).unwrap() })
// Literals always compile.
Some(Regex::new(&bytes_to_regex(lit)).unwrap())
}
}
}
@ -74,39 +71,19 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
lits.cross_add(s.as_bytes());
}
Literal { ref chars, casei: true } => {
for &c in chars {
let cls = CharClass::new(vec![
ClassRange { start: c, end: c },
]).case_fold();
if !lits.add_char_class(&cls) {
lits.cut();
return;
}
}
lits.cut();
}
LiteralBytes { ref bytes, casei: false } => {
lits.cross_add(bytes);
}
LiteralBytes { ref bytes, casei: true } => {
for &b in bytes {
let cls = ByteClass::new(vec![
ByteRange { start: b, end: b },
]).case_fold();
if !lits.add_byte_class(&cls) {
lits.cut();
return;
}
}
lits.cut();
}
Class(ref cls) => {
if !lits.add_char_class(cls) {
lits.cut();
}
lits.cut();
}
ClassBytes(ref cls) => {
if !lits.add_byte_class(cls) {
lits.cut();
}
lits.cut();
}
Group { ref e, .. } => {
union_required(&**e, lits);
@ -212,3 +189,13 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
lits.add(Lit::new(lcs.to_vec()));
}
}
/// Converts an arbitrary sequence of bytes to a literal suitable for building
/// a regular expression.
///
/// Every byte is emitted as a `\xNN` escape (two lowercase hex digits), so
/// the result is valid pattern text regardless of whether the input is UTF-8
/// or contains regex meta characters. An empty input yields an empty string.
fn bytes_to_regex(bs: &[u8]) -> String {
    let hex = b"0123456789abcdef";
    // Each input byte expands to exactly four characters: `\`, `x`, and two
    // hex digits. Reserving that up front avoids regrowing the buffer.
    let mut s = String::with_capacity(bs.len() * 4);
    for &b in bs {
        s.push_str("\\x");
        s.push(hex[(b >> 4) as usize] as char);
        s.push(hex[(b & 0x0f) as usize] as char);
    }
    s
}

View File

@ -20,9 +20,11 @@ use docopt::Docopt;
use regex::bytes::Regex;
use literals::LiteralSets;
use search::{LineSearcher, LineSearcherBuilder};
mod literals;
mod nonl;
mod search;
pub type Result<T> = result::Result<T, Box<Error + Send + Sync>>;
@ -46,20 +48,36 @@ fn main() {
}
// Runs a search described by `args` and returns the `u64` count produced by
// `run_by_line` (no file arguments, input read from locked stdin) or by
// `run_mmap` (file arguments given).
//
// NOTE(review): this span looks like a rendered diff in which removed and
// added lines are interleaved: `expr`, `literals` and `re` are bound both
// before and inside the `if`, and `run_mmap` is called once with `&re` and
// once with `&searcher`. Confirm against the real file before relying on it.
fn run(args: &Args) -> Result<u64> {
let expr = try!(parse(&args.arg_pattern));
let literals = LiteralSets::create(&expr);
// println!("{:?}", literals);
// println!("{:?}", literals.to_matcher());
let re = Regex::new(&expr.to_string()).unwrap();
if args.arg_file.is_empty() {
// No file arguments: parse and compile the pattern, then search stdin.
let expr = try!(parse(&args.arg_pattern));
let literals = LiteralSets::create(&expr);
let re = Regex::new(&expr.to_string()).unwrap();
// The `Stdin` handle is bound first so that the lock taken from it on the
// next line can borrow it.
let _stdin = io::stdin();
let stdin = _stdin.lock();
run_by_line(args, &re, stdin)
} else {
run_mmap(args, &re)
// File arguments given: build a `LineSearcher` from the raw pattern and
// hand it to `run_mmap`.
let searcher =
try!(LineSearcherBuilder::new(&args.arg_pattern).create());
run_mmap(args, &searcher)
}
}
/// Searches the single file named in `args.arg_file` through a read-only
/// memory map and writes every matching line, followed by `\n`, to stdout.
///
/// Returns the number of matching lines written.
///
/// # Panics
///
/// Panics if `args.arg_file` does not contain exactly one path.
///
/// # Errors
///
/// Returns an error if the file cannot be mapped or if writing to stdout
/// fails.
fn run_mmap(args: &Args, searcher: &LineSearcher) -> Result<u64> {
    use memmap::{Mmap, Protection};

    assert!(args.arg_file.len() == 1);
    let mut wtr = io::BufWriter::new(io::stdout());
    let mut count = 0;
    let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read));
    // NOTE(review): `as_slice` is unsafe; presumably the caller must ensure
    // the underlying file is not modified while the slice is alive. Confirm
    // against the memmap documentation.
    let text = unsafe { mmap.as_slice() };
    for m in searcher.search(text) {
        // `write` may accept only part of the buffer; `write_all` keeps
        // going until the whole line has been handed to the writer.
        try!(wtr.write_all(&text[m.start..m.end]));
        try!(wtr.write_all(b"\n"));
        count += 1;
    }
    // Flush explicitly so a failure is reported instead of being silently
    // dropped when the `BufWriter` goes out of scope.
    try!(wtr.flush());
    Ok(count)
}
fn run_by_line<B: BufRead>(
args: &Args,
re: &Regex,
@ -84,31 +102,6 @@ fn run_by_line<B: BufRead>(
Ok(count)
}
/// Searches the single file named in `args.arg_file` through a read-only
/// memory map and writes every line containing a match of `re`, followed by
/// `\n`, to stdout.
///
/// Returns the number of lines written.
///
/// # Panics
///
/// Panics if `args.arg_file` does not contain exactly one path.
///
/// # Errors
///
/// Returns an error if the file cannot be mapped or if writing to stdout
/// fails.
fn run_mmap(args: &Args, re: &Regex) -> Result<u64> {
    use memchr::{memchr, memrchr};
    use memmap::{Mmap, Protection};

    assert!(args.arg_file.len() == 1);
    let mut wtr = io::BufWriter::new(io::stdout());
    let mut count = 0;
    let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read));
    // NOTE(review): `as_slice` is unsafe; presumably the caller must ensure
    // the underlying file is not modified while the slice is alive. Confirm
    // against the memmap documentation.
    let text = unsafe { mmap.as_slice() };
    let mut start = 0;
    while let Some((s, e)) = re.find(&text[start..]) {
        // `find` reports offsets relative to the sub-slice; rebase them.
        let (s, e) = (start + s, start + e);
        // Widen the match to the enclosing line, excluding its terminator.
        let prevnl = memrchr(b'\n', &text[0..s]).map_or(0, |i| i + 1);
        let nextnl = memchr(b'\n', &text[e..]).map_or(text.len(), |i| e + i);
        // `write` may accept only part of the buffer; `write_all` keeps
        // going until the whole line has been handed to the writer.
        try!(wtr.write_all(&text[prevnl..nextnl]));
        try!(wtr.write_all(b"\n"));
        // Resume just past the line terminator so each line is reported once.
        start = nextnl + 1;
        count += 1;
        if start >= text.len() {
            break;
        }
    }
    // Flush explicitly so a failure is reported instead of being silently
    // dropped when the `BufWriter` goes out of scope.
    try!(wtr.flush());
    Ok(count)
}
fn parse(re: &str) -> Result<syntax::Expr> {
let expr =
try!(syntax::ExprBuilder::new()

164
src/search.rs Normal file
View File

@ -0,0 +1,164 @@
use memchr::{memchr, memrchr};
use regex::bytes::Regex;
use syntax;
use literals::LiteralSets;
use nonl;
use Result;
/// A line-oriented searcher produced by `LineSearcherBuilder::create`.
///
/// Use `search` to iterate over the lines of a buffer that match.
#[derive(Clone, Debug)]
pub struct LineSearcher {
/// The regex compiled from the full pattern.
re: Regex,
/// Optional literal pre-filter obtained from `LiteralSets::to_matcher`.
/// When present, the iterator looks for it first and only runs `re` on the
/// line that contains it.
required: Option<Regex>,
/// The options the builder was configured with.
opts: Options,
}
/// A builder that collects a pattern and search options and turns them into
/// a `LineSearcher` via `create`.
#[derive(Clone, Debug)]
pub struct LineSearcherBuilder {
/// The pattern text, stored as given to `new`; it is parsed in `create`.
pattern: String,
/// The options accumulated by the builder's setter methods.
opts: Options,
}
/// Search options shared by `LineSearcherBuilder` and `LineSearcher`.
///
/// All flags default to `false`.
#[derive(Clone, Debug, Default)]
struct Options {
/// When set, `create` prefixes the compiled pattern with `(?i)`.
case_insensitive: bool,
/// Set by `line_numbers`. NOTE(review): not read anywhere in the visible
/// code; `Match::line` is always `None` for now.
lines: bool,
/// Set by `locations`. NOTE(review): not read anywhere in the visible
/// code; `Match::locations` is always empty for now.
locations: bool,
}
impl LineSearcherBuilder {
    /// Starts a builder for `pattern` with every option at its default
    /// (disabled) value. The pattern is not parsed until `create`.
    pub fn new(pattern: &str) -> LineSearcherBuilder {
        LineSearcherBuilder {
            pattern: pattern.to_string(),
            opts: Options::default(),
        }
    }

    /// Enables or disables case-insensitive matching of the whole pattern.
    pub fn case_insensitive(mut self, yes: bool) -> LineSearcherBuilder {
        self.opts.case_insensitive = yes;
        self
    }

    /// Records whether line numbers are wanted.
    ///
    /// The flag is stored in the options; the iterator in this file does not
    /// consult it yet.
    pub fn line_numbers(mut self, yes: bool) -> LineSearcherBuilder {
        self.opts.lines = yes;
        self
    }

    /// Records whether per-match locations are wanted.
    ///
    /// The flag is stored in the options; the iterator in this file does not
    /// consult it yet.
    pub fn locations(mut self, yes: bool) -> LineSearcherBuilder {
        self.opts.locations = yes;
        self
    }

    /// Parses and compiles the pattern, producing a `LineSearcher`.
    ///
    /// # Errors
    ///
    /// Returns an error if the pattern fails to parse or if the parsed
    /// pattern cannot be compiled into a regex.
    pub fn create(self) -> Result<LineSearcher> {
        let expr = try!(parse(&self.pattern));
        let pat =
            if self.opts.case_insensitive {
                format!("(?i){}", expr)
            } else {
                expr.to_string()
            };
        // The literal pre-filter is derived from `expr`, which is parsed
        // without the `(?i)` flag added above. Using it for a
        // case-insensitive search would skip lines whose text differs from
        // the literal only by case, so the pre-filter is disabled then.
        let required =
            if self.opts.case_insensitive {
                None
            } else {
                LiteralSets::create(&expr).to_matcher()
            };
        // A pattern that parses can still fail to compile, so propagate the
        // error instead of panicking on user input.
        let re = try!(Regex::new(&pat));
        Ok(LineSearcher {
            re: re,
            required: required,
            opts: self.opts,
        })
    }
}
impl LineSearcher {
    /// Returns an iterator over the lines of `buf` that match this searcher.
    ///
    /// Iteration begins at the first byte of `buf` with a match counter of
    /// zero.
    pub fn search<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> {
        let (start, count) = (0, 0);
        Iter {
            buf: buf,
            searcher: self,
            count: count,
            start: start,
        }
    }
}
/// A single matching line reported by `Iter`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Match {
    /// Byte offset of the first byte of the matching line.
    pub start: usize,
    /// Byte offset one past the last byte of the matching line; the line
    /// terminator, if any, is excluded.
    pub end: usize,
    /// Zero-based index of this match among the matches yielded so far.
    pub count: usize,
    /// Line number of the match, when available. The iterator in this file
    /// always sets it to `None`.
    pub line: Option<usize>,
    /// `(start, end)` offset pairs for this match. The iterator in this file
    /// always leaves the list empty.
    pub locations: Vec<(usize, usize)>,
}
/// An iterator over the matching lines of a buffer, created by
/// `LineSearcher::search`.
///
/// `'b` is the lifetime of the searched buffer and `'s` the lifetime of the
/// searcher.
pub struct Iter<'b, 's> {
/// The searcher supplying the regex and the optional literal pre-filter.
searcher: &'s LineSearcher,
/// The buffer being searched.
buf: &'b [u8],
/// Byte offset in `buf` at which the next search begins.
start: usize,
/// Number of matches yielded so far.
count: usize,
}
impl<'b, 's> Iter<'b, 's> {
fn next_line_match(&mut self) -> Option<(usize, usize)> {
if self.start >= self.buf.len() {
return None;
}
if let Some(ref req) = self.searcher.required {
while self.start < self.buf.len() {
let (s, e) = match req.find(&self.buf[self.start..]) {
None => return None,
Some((s, e)) => (self.start + s, self.start + e),
};
let (prevnl, nextnl) = self.find_line(s, e);
match self.searcher.re.find(&self.buf[prevnl..nextnl]) {
None => {
self.start = nextnl + 1;
continue;
}
Some(_) => return Some((prevnl, nextnl)),
}
}
None
} else {
let (s, e) = match self.searcher.re.find(&self.buf[self.start..]) {
None => return None,
Some((s, e)) => (self.start + s, self.start + e),
};
Some(self.find_line(s, e))
}
}
fn find_line(&self, s: usize, e: usize) -> (usize, usize) {
let prevnl =
memrchr(b'\n', &self.buf[0..s]).map_or(0, |i| i + 1);
let nextnl =
memchr(b'\n', &self.buf[e..]).map_or(self.buf.len(), |i| e + i);
(prevnl, nextnl)
}
}
impl<'b, 's> Iterator for Iter<'b, 's> {
    type Item = Match;

    /// Yields the next matching line, or `None` when there are no more.
    ///
    /// After a hit, the search position moves just past the line's end and
    /// the running match counter is incremented.
    fn next(&mut self) -> Option<Match> {
        if let Some((line_start, line_end)) = self.next_line_match() {
            let index = self.count;
            self.count += 1;
            self.start = line_end + 1;
            Some(Match {
                start: line_start,
                end: line_end,
                count: index,
                line: None,
                locations: vec![],
            })
        } else {
            None
        }
    }
}
/// Parses `re` into an expression with byte matching allowed and Unicode
/// support turned off, then passes the result through `nonl::remove`.
///
/// Returns an error if either parsing or `nonl::remove` fails.
fn parse(re: &str) -> Result<syntax::Expr> {
    let builder = syntax::ExprBuilder::new()
        .allow_bytes(true)
        .unicode(false);
    let parsed = try!(builder.parse(re));
    let stripped = try!(nonl::remove(parsed));
    Ok(stripped)
}