use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; use grep_matcher::{Matcher, Match, NoError}; use regex_syntax::hir::Hir; use error::Error; use matcher::RegexCaptures; /// A matcher for an alternation of literals. /// /// Ideally, this optimization would be pushed down into the regex engine, but /// making this work correctly there would require quite a bit of refactoring. /// Moreover, doing it one layer above lets us do thing like, "if we /// specifically only want to search for literals, then don't bother with /// regex parsing at all." #[derive(Clone, Debug)] pub struct MultiLiteralMatcher { /// The Aho-Corasick automaton. ac: AhoCorasick, } impl MultiLiteralMatcher { /// Create a new multi-literal matcher from the given literals. pub fn new>( literals: &[B], ) -> Result { let ac = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostFirst) .auto_configure(literals) .build_with_size::(literals) .map_err(Error::regex)?; Ok(MultiLiteralMatcher { ac }) } } impl Matcher for MultiLiteralMatcher { type Captures = RegexCaptures; type Error = NoError; fn find_at( &self, haystack: &[u8], at: usize, ) -> Result, NoError> { match self.ac.find(&haystack[at..]) { None => Ok(None), Some(m) => Ok(Some(Match::new(at + m.start(), at + m.end()))), } } fn new_captures(&self) -> Result { Ok(RegexCaptures::simple()) } fn capture_count(&self) -> usize { 1 } fn capture_index(&self, _: &str) -> Option { None } fn captures_at( &self, haystack: &[u8], at: usize, caps: &mut RegexCaptures, ) -> Result { caps.set_simple(None); let mat = self.find_at(haystack, at)?; caps.set_simple(mat); Ok(mat.is_some()) } // We specifically do not implement other methods like find_iter. Namely, // the iter methods are guaranteed to be correct by virtue of implementing // find_at above. } /// Alternation literals checks if the given HIR is a simple alternation of /// literals, and if so, returns them. Otherwise, this returns None. pub fn alternation_literals(expr: &Hir) -> Option>> { use regex_syntax::hir::{HirKind, Literal}; // This is pretty hacky, but basically, if `is_alternation_literal` is // true, then we can make several assumptions about the structure of our // HIR. This is what justifies the `unreachable!` statements below. if !expr.is_alternation_literal() { return None; } let alts = match *expr.kind() { HirKind::Alternation(ref alts) => alts, _ => return None, // one literal isn't worth it }; let extendlit = |lit: &Literal, dst: &mut Vec| { match *lit { Literal::Unicode(c) => { let mut buf = [0; 4]; dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); } Literal::Byte(b) => { dst.push(b); } } }; let mut lits = vec![]; for alt in alts { let mut lit = vec![]; match *alt.kind() { HirKind::Empty => {} HirKind::Literal(ref x) => extendlit(x, &mut lit), HirKind::Concat(ref exprs) => { for e in exprs { match *e.kind() { HirKind::Literal(ref x) => extendlit(x, &mut lit), _ => unreachable!("expected literal, got {:?}", e), } } } _ => unreachable!("expected literal or concat, got {:?}", alt), } lits.push(lit); } Some(lits) }