1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-01-03 05:10:12 +02:00

regex: some minor polish

I think I already did a clean-up of this crate when I moved it to regex
1.9, so the polish here is very minor.
This commit is contained in:
Andrew Gallant 2023-09-25 17:21:28 -04:00
parent 798f8981eb
commit 82d3183a04
6 changed files with 16 additions and 127 deletions

5
Cargo.lock generated
View File

@ -4,9 +4,9 @@ version = 3
[[package]]
name = "aho-corasick"
version = "1.1.0"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f2135563fb5c609d2b2b87c1e8ce7bc41b0b45430fa9661f457981503dd5bf0"
checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
dependencies = [
"memchr",
]
@ -219,7 +219,6 @@ dependencies = [
name = "grep-regex"
version = "0.1.11"
dependencies = [
"aho-corasick",
"bstr",
"grep-matcher",
"log",

View File

@ -14,9 +14,8 @@ license = "Unlicense OR MIT"
edition = "2021"
[dependencies]
aho-corasick = "1.0.2"
bstr = "1.6.0"
bstr = "1.6.2"
grep-matcher = { version = "0.1.6", path = "../matcher" }
log = "0.4.19"
regex-automata = { version = "0.3.0" }
regex-syntax = "0.7.2"
log = "0.4.20"
regex-automata = { version = "0.3.8" }
regex-syntax = "0.7.5"

View File

@ -3,8 +3,10 @@ An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
*/
#![deny(missing_docs)]
pub use crate::error::{Error, ErrorKind};
pub use crate::matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
pub use crate::{
error::{Error, ErrorKind},
matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder},
};
mod ast;
mod config;

View File

@ -831,9 +831,10 @@ impl RegexCaptures {
#[cfg(test)]
mod tests {
use super::*;
use grep_matcher::{LineMatchKind, Matcher};
use super::*;
// Test that enabling word matches does the right thing and demonstrate
// the difference between it and surrounding the regex in `\b`.
#[test]

View File

@ -1,112 +0,0 @@
use aho_corasick::{AhoCorasick, MatchKind};
use grep_matcher::{Match, Matcher, NoError};
use regex_syntax::hir::{Hir, HirKind};
use crate::error::Error;
use crate::matcher::RegexCaptures;
/// A matcher for an alternation of literals.
///
/// Ideally, this optimization would be pushed down into the regex engine, but
/// making this work correctly there would require quite a bit of refactoring.
/// Moreover, doing it one layer above lets us do things like, "if we
/// specifically only want to search for literals, then don't bother with
/// regex parsing at all."
///
/// All searching is delegated to a single Aho-Corasick automaton that is
/// constructed from the literals handed to `MultiLiteralMatcher::new`.
#[derive(Clone, Debug)]
pub struct MultiLiteralMatcher {
/// The Aho-Corasick automaton. It is built with leftmost-first match
/// semantics (see `MultiLiteralMatcher::new`).
ac: AhoCorasick,
}
impl MultiLiteralMatcher {
/// Create a new multi-literal matcher from the given literals.
pub fn new<B: AsRef<[u8]>>(
literals: &[B],
) -> Result<MultiLiteralMatcher, Error> {
let ac = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
.build(literals)
.map_err(Error::generic)?;
Ok(MultiLiteralMatcher { ac })
}
}
impl Matcher for MultiLiteralMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    /// Search `haystack` for the first literal match starting at offset `at`.
    ///
    /// The automaton is run over the sub-slice `haystack[at..]`, so the
    /// offsets it reports are shifted by `at` to make them relative to the
    /// start of `haystack` again.
    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        let found = self
            .ac
            .find(&haystack[at..])
            .map(|hit| Match::new(at + hit.start(), at + hit.end()));
        Ok(found)
    }

    /// Create a fresh set of "simple" captures.
    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::simple())
    }

    /// Always reports exactly one capture group.
    fn capture_count(&self) -> usize {
        1
    }

    /// No capture group names are ever resolved by this matcher.
    fn capture_index(&self, _: &str) -> Option<usize> {
        None
    }

    /// Run `find_at` and record its result in `caps`.
    ///
    /// `caps` is cleared before searching and then set to whatever the
    /// search produced. Returns `true` if and only if a match was found.
    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        caps.set_simple(None);
        let found = self.find_at(haystack, at)?;
        caps.set_simple(found);
        Ok(found.is_some())
    }

    // The remaining trait methods (e.g. `find_iter`) are intentionally left
    // at their defaults: the iterator methods are guaranteed to be correct
    // by virtue of `find_at` being implemented above.
}
/// Extract the literals from an HIR that is a plain alternation of literals.
///
/// Returns `None` when the HIR is not reported as an alternation of literals,
/// or when it is not an `Alternation` node at all (a lone literal is not
/// worth the special handling). Otherwise, returns one byte string per
/// alternate, in order; an empty alternate yields an empty byte string.
pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
    // Once `is_alternation_literal` holds, the shape of the HIR is tightly
    // constrained: every alternate is empty, a literal, or a concatenation of
    // literals. That constraint is what justifies the `unreachable!` arms
    // further down.
    if !expr.properties().is_alternation_literal() {
        return None;
    }
    let branches = match expr.kind() {
        HirKind::Alternation(branches) => branches,
        // one literal isn't worth it
        _ => return None,
    };
    let literals: Vec<Vec<u8>> = branches
        .iter()
        .map(|branch| {
            let mut bytes = Vec::new();
            match branch.kind() {
                HirKind::Empty => {}
                HirKind::Literal(piece) => bytes.extend_from_slice(&piece.0),
                HirKind::Concat(parts) => {
                    for part in parts {
                        match part.kind() {
                            HirKind::Literal(piece) => {
                                bytes.extend_from_slice(&piece.0)
                            }
                            _ => unreachable!(
                                "expected literal, got {:?}",
                                part
                            ),
                        }
                    }
                }
                _ => unreachable!(
                    "expected literal or concat, got {:?}",
                    branch
                ),
            }
            bytes
        })
        .collect();
    Some(literals)
}

View File

@ -20,11 +20,11 @@ use crate::error::{Error, ErrorKind};
///
/// Note that as of regex 1.9, this routine could theoretically be implemented
/// without returning an error. Namely, for example, we could turn
/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminators with a
/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminator with a
/// sub-expression that can never match anything. Thus, ripgrep would accept
/// such regexes and just silently not match anything. Regex versions prior to 1.8
/// don't support such constructs. I ended up deciding to leave the existing
/// behavior of returning an error instead. For example:
/// such regexes and just silently not match anything. Regex versions prior
/// to 1.8 don't support such constructs. I ended up deciding to leave the
/// existing behavior of returning an error instead. For example:
///
/// ```text
/// $ echo -n 'foo\nbar\n' | rg 'foo\nbar'