use memchr::{memchr, memrchr}; use regex::bytes::{Regex, RegexBuilder}; use syntax; use literals::LiteralSets; use nonl; use Result; /// A matched line. #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct Match { start: usize, end: usize, } impl Match { /// Create a new empty match value. pub fn new() -> Match { Match::default() } /// Return the starting byte offset of the line that matched. #[inline] pub fn start(&self) -> usize { self.start } /// Return the ending byte offset of the line that matched. #[inline] pub fn end(&self) -> usize { self.end } } /// A fast line oriented regex searcher. #[derive(Clone, Debug)] pub struct Grep { re: Regex, required: Option, opts: Options, } /// A builder for a grep searcher. #[derive(Clone, Debug)] pub struct GrepBuilder { pattern: String, opts: Options, } #[derive(Clone, Debug)] struct Options { case_insensitive: bool, line_terminator: u8, size_limit: usize, dfa_size_limit: usize, } impl Default for Options { fn default() -> Options { Options { case_insensitive: false, line_terminator: b'\n', size_limit: 10 * (1 << 20), dfa_size_limit: 10 * (1 << 20), } } } impl GrepBuilder { /// Create a new builder for line searching. /// /// The pattern given should be a regular expression. The precise syntax /// supported is documented on the regex crate. pub fn new(pattern: &str) -> GrepBuilder { GrepBuilder { pattern: pattern.to_string(), opts: Options::default(), } } /// Set the line terminator. /// /// The line terminator can be any ASCII character and serves to delineate /// the match boundaries in the text searched. /// /// This panics if `ascii_byte` is greater than `0x7F` (i.e., not ASCII). pub fn line_terminator(mut self, ascii_byte: u8) -> GrepBuilder { assert!(ascii_byte <= 0x7F); self.opts.line_terminator = ascii_byte; self } /// Set the case sensitive flag (`i`) on the regex. pub fn case_insensitive(mut self, yes: bool) -> GrepBuilder { self.opts.case_insensitive = yes; self } /// Set the approximate size limit of the compiled regular expression. /// /// This roughly corresponds to the number of bytes occupied by a /// single compiled program. If the program exceeds this number, then a /// compilation error is returned. pub fn size_limit(mut self, limit: usize) -> GrepBuilder { self.opts.size_limit = limit; self } /// Set the approximate size of the cache used by the DFA. /// /// This roughly corresponds to the number of bytes that the DFA will use /// while searching. /// /// Note that this is a per thread limit. There is no way to set a global /// limit. In particular, if a regex is used from multiple threads /// simulanteously, then each thread may use up to the number of bytes /// specified here. pub fn dfa_size_limit(mut self, limit: usize) -> GrepBuilder { self.opts.dfa_size_limit = limit; self } /// Create a line searcher. /// /// If there was a problem parsing or compiling the regex with the given /// options, then an error is returned. pub fn build(self) -> Result { let expr = try!(self.parse()); let literals = LiteralSets::create(&expr); let re = try!( RegexBuilder::new(&expr.to_string()) .case_insensitive(self.opts.case_insensitive) .multi_line(true) .unicode(true) .size_limit(self.opts.size_limit) .dfa_size_limit(self.opts.dfa_size_limit) .compile() ); Ok(Grep { re: re, required: literals.to_regex(), opts: self.opts, }) } /// Parses the underlying pattern and ensures the pattern can never match /// the line terminator. fn parse(&self) -> Result { let expr = try!(syntax::ExprBuilder::new() .allow_bytes(true) .unicode(true) .case_insensitive(self.opts.case_insensitive) .parse(&self.pattern)); debug!("regex ast:\n{:#?}", expr); Ok(try!(nonl::remove(expr, self.opts.line_terminator))) } } impl Grep { /// Returns a reference to the underlying regex used by the searcher. pub fn regex(&self) -> &Regex { &self.re } /// Returns an iterator over all matches in the given buffer. pub fn iter<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> { Iter { searcher: self, buf: buf, start: 0, } } /// Fills in the next line that matches in the given buffer starting at /// the position given. /// /// If no match could be found, `false` is returned, otherwise, `true` is /// returned. pub fn read_match( &self, mat: &mut Match, buf: &[u8], mut start: usize, ) -> bool { if start >= buf.len() { return false; } if let Some(ref req) = self.required { while start < buf.len() { let e = match req.shortest_match(&buf[start..]) { None => return false, Some(e) => start + e, }; let (prevnl, nextnl) = self.find_line(buf, e, e); match self.re.shortest_match(&buf[prevnl..nextnl]) { None => { start = nextnl + 1; continue; } Some(_) => { self.fill_match(mat, prevnl, nextnl); return true; } } } false } else { let e = match self.re.shortest_match(&buf[start..]) { None => return false, Some(e) => start + e, }; let (s, e) = self.find_line(buf, e, e); self.fill_match(mat, s, e); true } } fn fill_match(&self, mat: &mut Match, start: usize, end: usize) { mat.start = start; mat.end = end; } fn find_line(&self, buf: &[u8], s: usize, e: usize) -> (usize, usize) { (self.find_line_start(buf, s), self.find_line_end(buf, e)) } fn find_line_start(&self, buf: &[u8], pos: usize) -> usize { memrchr(self.opts.line_terminator, &buf[0..pos]).map_or(0, |i| i + 1) } fn find_line_end(&self, buf: &[u8], pos: usize) -> usize { memchr(self.opts.line_terminator, &buf[pos..]) .map_or(buf.len(), |i| pos + i + 1) } } /// An iterator over all matches in a particular buffer. /// /// `'b` refers to the lifetime of the buffer, and `'s` refers to the lifetime /// of the searcher. pub struct Iter<'b, 's> { searcher: &'s Grep, buf: &'b [u8], start: usize, } impl<'b, 's> Iterator for Iter<'b, 's> { type Item = Match; fn next(&mut self) -> Option { let mut mat = Match::default(); if !self.searcher.read_match(&mut mat, self.buf, self.start) { self.start = self.buf.len(); return None; } self.start = mat.end; Some(mat) } } #[cfg(test)] mod tests { #![allow(unused_imports)] use memchr::{memchr, memrchr}; use regex::bytes::Regex; use super::{GrepBuilder, Match}; static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt"); #[allow(dead_code)] fn s(bytes: &[u8]) -> String { String::from_utf8(bytes.to_vec()).unwrap() } fn find_lines(pat: &str, haystack: &[u8]) -> Vec { let re = Regex::new(pat).unwrap(); let mut lines = vec![]; for (s, e) in re.find_iter(haystack) { let start = memrchr(b'\n', &haystack[..s]) .map_or(0, |i| i + 1); let end = memchr(b'\n', &haystack[e..]) .map_or(haystack.len(), |i| e + i + 1); lines.push(Match { start: start, end: end, }); } lines } fn grep_lines(pat: &str, haystack: &[u8]) -> Vec { let g = GrepBuilder::new(pat).build().unwrap(); g.iter(haystack).collect() } #[test] fn buffered_literal() { let expected = find_lines("Sherlock Holmes", SHERLOCK); let got = grep_lines("Sherlock Holmes", SHERLOCK); assert_eq!(expected.len(), got.len()); assert_eq!(expected, got); } }