regex: some minor polish

I think I already did a clean-up of this crate when I moved it to regex 1.9, so the polish here is very minor.
2025-08-04 21:52:54 +02:00 · 2023-09-25 17:21:28 -04:00
parent 798f8981eb
commit 82d3183a04
6 changed files with 16 additions and 127 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,9 +4,9 @@ version = 3

 [[package]]
 name = "aho-corasick"
-version = "1.1.0"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f2135563fb5c609d2b2b87c1e8ce7bc41b0b45430fa9661f457981503dd5bf0"
+checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
 dependencies = [
 "memchr",
 ]
@@ -219,7 +219,6 @@ dependencies = [
 name = "grep-regex"
 version = "0.1.11"
 dependencies = [
- "aho-corasick",
 "bstr",
 "grep-matcher",
 "log",
--- a/crates/regex/Cargo.toml
+++ b/crates/regex/Cargo.toml
@@ -14,9 +14,8 @@ license = "Unlicense OR MIT"
 edition = "2021"

 [dependencies]
-aho-corasick = "1.0.2"
-bstr = "1.6.0"
+bstr = "1.6.2"
 grep-matcher = { version = "0.1.6", path = "../matcher" }
-log = "0.4.19"
-regex-automata = { version = "0.3.0" }
-regex-syntax = "0.7.2"
+log = "0.4.20"
+regex-automata = { version = "0.3.8" }
+regex-syntax = "0.7.5"
--- a/crates/regex/src/lib.rs
+++ b/crates/regex/src/lib.rs
@@ -3,8 +3,10 @@ An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
 */
 #![deny(missing_docs)]

-pub use crate::error::{Error, ErrorKind};
-pub use crate::matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
+pub use crate::{
+    error::{Error, ErrorKind},
+    matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder},
+};

 mod ast;
 mod config;
--- a/crates/regex/src/matcher.rs
+++ b/crates/regex/src/matcher.rs
@@ -831,9 +831,10 @@ impl RegexCaptures {

 #[cfg(test)]
 mod tests {
-    use super::*;
    use grep_matcher::{LineMatchKind, Matcher};

+    use super::*;
+
    // Test that enabling word matches does the right thing and demonstrate
    // the difference between it and surrounding the regex in `\b`.
    #[test]
--- a/crates/regex/src/multi.rs
+++ b/crates/regex/src/multi.rs
@@ -1,112 +0,0 @@
-use aho_corasick::{AhoCorasick, MatchKind};
-use grep_matcher::{Match, Matcher, NoError};
-use regex_syntax::hir::{Hir, HirKind};
-
-use crate::error::Error;
-use crate::matcher::RegexCaptures;
-
-/// A matcher for an alternation of literals.
-///
-/// Ideally, this optimization would be pushed down into the regex engine, but
-/// making this work correctly there would require quite a bit of refactoring.
-/// Moreover, doing it one layer above lets us do thing like, "if we
-/// specifically only want to search for literals, then don't bother with
-/// regex parsing at all."
-#[derive(Clone, Debug)]
-pub struct MultiLiteralMatcher {
-    /// The Aho-Corasick automaton.
-    ac: AhoCorasick,
-}
-
-impl MultiLiteralMatcher {
-    /// Create a new multi-literal matcher from the given literals.
-    pub fn new<B: AsRef<[u8]>>(
-        literals: &[B],
-    ) -> Result<MultiLiteralMatcher, Error> {
-        let ac = AhoCorasick::builder()
-            .match_kind(MatchKind::LeftmostFirst)
-            .build(literals)
-            .map_err(Error::generic)?;
-        Ok(MultiLiteralMatcher { ac })
-    }
-}
-
-impl Matcher for MultiLiteralMatcher {
-    type Captures = RegexCaptures;
-    type Error = NoError;
-
-    fn find_at(
-        &self,
-        haystack: &[u8],
-        at: usize,
-    ) -> Result<Option<Match>, NoError> {
-        match self.ac.find(&haystack[at..]) {
-            None => Ok(None),
-            Some(m) => Ok(Some(Match::new(at + m.start(), at + m.end()))),
-        }
-    }
-
-    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
-        Ok(RegexCaptures::simple())
-    }
-
-    fn capture_count(&self) -> usize {
-        1
-    }
-
-    fn capture_index(&self, _: &str) -> Option<usize> {
-        None
-    }
-
-    fn captures_at(
-        &self,
-        haystack: &[u8],
-        at: usize,
-        caps: &mut RegexCaptures,
-    ) -> Result<bool, NoError> {
-        caps.set_simple(None);
-        let mat = self.find_at(haystack, at)?;
-        caps.set_simple(mat);
-        Ok(mat.is_some())
-    }
-
-    // We specifically do not implement other methods like find_iter. Namely,
-    // the iter methods are guaranteed to be correct by virtue of implementing
-    // find_at above.
-}
-
-/// Alternation literals checks if the given HIR is a simple alternation of
-/// literals, and if so, returns them. Otherwise, this returns None.
-pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
-    // This is pretty hacky, but basically, if `is_alternation_literal` is
-    // true, then we can make several assumptions about the structure of our
-    // HIR. This is what justifies the `unreachable!` statements below.
-
-    if !expr.properties().is_alternation_literal() {
-        return None;
-    }
-    let alts = match *expr.kind() {
-        HirKind::Alternation(ref alts) => alts,
-        _ => return None, // one literal isn't worth it
-    };
-
-    let mut lits = vec![];
-    for alt in alts {
-        let mut lit = vec![];
-        match *alt.kind() {
-            HirKind::Empty => {}
-            HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
-            HirKind::Concat(ref exprs) => {
-                for e in exprs {
-                    match *e.kind() {
-                        HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
-                        _ => unreachable!("expected literal, got {:?}", e),
-                    }
-                }
-            }
-            _ => unreachable!("expected literal or concat, got {:?}", alt),
-        }
-        lits.push(lit);
-    }
-    Some(lits)
-}
--- a/crates/regex/src/strip.rs
+++ b/crates/regex/src/strip.rs
@@ -20,11 +20,11 @@ use crate::error::{Error, ErrorKind};
 ///
 /// Note that as of regex 1.9, this routine could theoretically be implemented
 /// without returning an error. Namely, for example, we could turn
-/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminators with a
+/// `foo\nbar` into `foo[a&&b]bar`. That is, replace the line terminator with a
 /// sub-expression that can never match anything. Thus, ripgrep would accept
-/// such regexes and just silently not match anything. Regex versions prior to 1.8
-/// don't support such constructs. I ended up deciding to leave the existing
-/// behavior of returning an error instead. For example:
+/// such regexes and just silently not match anything. Regex versions prior
+/// to 1.8 don't support such constructs. I ended up deciding to leave the
+/// existing behavior of returning an error instead. For example:
 ///
 /// ```text
 /// $ echo -n 'foo\nbar\n' | rg 'foo\nbar'