mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-06-25 14:22:54 +02:00
127 lines
3.8 KiB
Rust
127 lines
3.8 KiB
Rust
![]() |
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
|
||
|
use grep_matcher::{Matcher, Match, NoError};
|
||
|
use regex_syntax::hir::Hir;
|
||
|
|
||
|
use error::Error;
|
||
|
use matcher::RegexCaptures;
|
||
|
|
||
|
/// A matcher for an alternation of literals.
|
||
|
///
|
||
|
/// Ideally, this optimization would be pushed down into the regex engine, but
|
||
|
/// making this work correctly there would require quite a bit of refactoring.
|
||
|
/// Moreover, doing it one layer above lets us do thing like, "if we
|
||
|
/// specifically only want to search for literals, then don't bother with
|
||
|
/// regex parsing at all."
|
||
|
#[derive(Clone, Debug)]
|
||
|
pub struct MultiLiteralMatcher {
|
||
|
/// The Aho-Corasick automaton.
|
||
|
ac: AhoCorasick,
|
||
|
}
|
||
|
|
||
|
impl MultiLiteralMatcher {
|
||
|
/// Create a new multi-literal matcher from the given literals.
|
||
|
pub fn new<B: AsRef<[u8]>>(
|
||
|
literals: &[B],
|
||
|
) -> Result<MultiLiteralMatcher, Error> {
|
||
|
let ac = AhoCorasickBuilder::new()
|
||
|
.match_kind(MatchKind::LeftmostFirst)
|
||
|
.auto_configure(literals)
|
||
|
.build_with_size::<usize, _, _>(literals)
|
||
|
.map_err(Error::regex)?;
|
||
|
Ok(MultiLiteralMatcher { ac })
|
||
|
}
|
||
|
}
|
||
|
|
||
|
impl Matcher for MultiLiteralMatcher {
|
||
|
type Captures = RegexCaptures;
|
||
|
type Error = NoError;
|
||
|
|
||
|
fn find_at(
|
||
|
&self,
|
||
|
haystack: &[u8],
|
||
|
at: usize,
|
||
|
) -> Result<Option<Match>, NoError> {
|
||
|
match self.ac.find(&haystack[at..]) {
|
||
|
None => Ok(None),
|
||
|
Some(m) => Ok(Some(Match::new(at + m.start(), at + m.end()))),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||
|
Ok(RegexCaptures::simple())
|
||
|
}
|
||
|
|
||
|
fn capture_count(&self) -> usize {
|
||
|
1
|
||
|
}
|
||
|
|
||
|
fn capture_index(&self, _: &str) -> Option<usize> {
|
||
|
None
|
||
|
}
|
||
|
|
||
|
fn captures_at(
|
||
|
&self,
|
||
|
haystack: &[u8],
|
||
|
at: usize,
|
||
|
caps: &mut RegexCaptures,
|
||
|
) -> Result<bool, NoError> {
|
||
|
caps.set_simple(None);
|
||
|
let mat = self.find_at(haystack, at)?;
|
||
|
caps.set_simple(mat);
|
||
|
Ok(mat.is_some())
|
||
|
}
|
||
|
|
||
|
// We specifically do not implement other methods like find_iter. Namely,
|
||
|
// the iter methods are guaranteed to be correct by virtue of implementing
|
||
|
// find_at above.
|
||
|
}
|
||
|
|
||
|
/// Alternation literals checks if the given HIR is a simple alternation of
|
||
|
/// literals, and if so, returns them. Otherwise, this returns None.
|
||
|
pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
|
||
|
use regex_syntax::hir::{HirKind, Literal};
|
||
|
|
||
|
// This is pretty hacky, but basically, if `is_alternation_literal` is
|
||
|
// true, then we can make several assumptions about the structure of our
|
||
|
// HIR. This is what justifies the `unreachable!` statements below.
|
||
|
|
||
|
if !expr.is_alternation_literal() {
|
||
|
return None;
|
||
|
}
|
||
|
let alts = match *expr.kind() {
|
||
|
HirKind::Alternation(ref alts) => alts,
|
||
|
_ => return None, // one literal isn't worth it
|
||
|
};
|
||
|
|
||
|
let extendlit = |lit: &Literal, dst: &mut Vec<u8>| {
|
||
|
match *lit {
|
||
|
Literal::Unicode(c) => {
|
||
|
let mut buf = [0; 4];
|
||
|
dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
|
||
|
}
|
||
|
Literal::Byte(b) => {
|
||
|
dst.push(b);
|
||
|
}
|
||
|
}
|
||
|
};
|
||
|
|
||
|
let mut lits = vec![];
|
||
|
for alt in alts {
|
||
|
let mut lit = vec![];
|
||
|
match *alt.kind() {
|
||
|
HirKind::Literal(ref x) => extendlit(x, &mut lit),
|
||
|
HirKind::Concat(ref exprs) => {
|
||
|
for e in exprs {
|
||
|
match *e.kind() {
|
||
|
HirKind::Literal(ref x) => extendlit(x, &mut lit),
|
||
|
_ => unreachable!("expected literal, got {:?}", e),
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
_ => unreachable!("expected literal or concat, got {:?}", alt),
|
||
|
}
|
||
|
lits.push(lit);
|
||
|
}
|
||
|
Some(lits)
|
||
|
}
|