mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-01-03 05:10:12 +02:00
regex: some minor polish
I think I already did a clean-up of this crate when I moved it to regex 1.9, so the polish here is very minor.
This commit is contained in:
parent
798f8981eb
commit
82d3183a04
5
Cargo.lock
generated
5
Cargo.lock
generated
@ -4,9 +4,9 @@ version = 3
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.0"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f2135563fb5c609d2b2b87c1e8ce7bc41b0b45430fa9661f457981503dd5bf0"
|
||||
checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
@ -219,7 +219,6 @@ dependencies = [
|
||||
name = "grep-regex"
|
||||
version = "0.1.11"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"bstr",
|
||||
"grep-matcher",
|
||||
"log",
|
||||
|
@ -14,9 +14,8 @@ license = "Unlicense OR MIT"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
aho-corasick = "1.0.2"
|
||||
bstr = "1.6.0"
|
||||
bstr = "1.6.2"
|
||||
grep-matcher = { version = "0.1.6", path = "../matcher" }
|
||||
log = "0.4.19"
|
||||
regex-automata = { version = "0.3.0" }
|
||||
regex-syntax = "0.7.2"
|
||||
log = "0.4.20"
|
||||
regex-automata = { version = "0.3.8" }
|
||||
regex-syntax = "0.7.5"
|
||||
|
@ -3,8 +3,10 @@ An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
|
||||
*/
|
||||
#![deny(missing_docs)]
|
||||
|
||||
pub use crate::error::{Error, ErrorKind};
|
||||
pub use crate::matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
|
||||
pub use crate::{
|
||||
error::{Error, ErrorKind},
|
||||
matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder},
|
||||
};
|
||||
|
||||
mod ast;
|
||||
mod config;
|
||||
|
@ -831,9 +831,10 @@ impl RegexCaptures {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use grep_matcher::{LineMatchKind, Matcher};
|
||||
|
||||
use super::*;
|
||||
|
||||
// Test that enabling word matches does the right thing and demonstrate
|
||||
// the difference between it and surrounding the regex in `\b`.
|
||||
#[test]
|
||||
|
@ -1,112 +0,0 @@
|
||||
use aho_corasick::{AhoCorasick, MatchKind};
|
||||
use grep_matcher::{Match, Matcher, NoError};
|
||||
use regex_syntax::hir::{Hir, HirKind};
|
||||
|
||||
use crate::error::Error;
|
||||
use crate::matcher::RegexCaptures;
|
||||
|
||||
/// A matcher for an alternation of literals.
|
||||
///
|
||||
/// Ideally, this optimization would be pushed down into the regex engine, but
|
||||
/// making this work correctly there would require quite a bit of refactoring.
|
||||
/// Moreover, doing it one layer above lets us do things like, "if we
|
||||
/// specifically only want to search for literals, then don't bother with
|
||||
/// regex parsing at all."
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct MultiLiteralMatcher {
|
||||
/// The Aho-Corasick automaton.
|
||||
ac: AhoCorasick,
|
||||
}
|
||||
|
||||
impl MultiLiteralMatcher {
|
||||
/// Create a new multi-literal matcher from the given literals.
|
||||
pub fn new<B: AsRef<[u8]>>(
|
||||
literals: &[B],
|
||||
) -> Result<MultiLiteralMatcher, Error> {
|
||||
let ac = AhoCorasick::builder()
|
||||
.match_kind(MatchKind::LeftmostFirst)
|
||||
.build(literals)
|
||||
.map_err(Error::generic)?;
|
||||
Ok(MultiLiteralMatcher { ac })
|
||||
}
|
||||
}
|
||||
|
||||
impl Matcher for MultiLiteralMatcher {
|
||||
type Captures = RegexCaptures;
|
||||
type Error = NoError;
|
||||
|
||||
fn find_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Result<Option<Match>, NoError> {
|
||||
match self.ac.find(&haystack[at..]) {
|
||||
None => Ok(None),
|
||||
Some(m) => Ok(Some(Match::new(at + m.start(), at + m.end()))),
|
||||
}
|
||||
}
|
||||
|
||||
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||
Ok(RegexCaptures::simple())
|
||||
}
|
||||
|
||||
fn capture_count(&self) -> usize {
|
||||
1
|
||||
}
|
||||
|
||||
fn capture_index(&self, _: &str) -> Option<usize> {
|
||||
None
|
||||
}
|
||||
|
||||
fn captures_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
caps: &mut RegexCaptures,
|
||||
) -> Result<bool, NoError> {
|
||||
caps.set_simple(None);
|
||||
let mat = self.find_at(haystack, at)?;
|
||||
caps.set_simple(mat);
|
||||
Ok(mat.is_some())
|
||||
}
|
||||
|
||||
// We specifically do not implement other methods like find_iter. Namely,
|
||||
// the iter methods are guaranteed to be correct by virtue of implementing
|
||||
// find_at above.
|
||||
}
|
||||
|
||||
/// Alternation literals checks if the given HIR is a simple alternation of
|
||||
/// literals, and if so, returns them. Otherwise, this returns None.
|
||||
pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
|
||||
// This is pretty hacky, but basically, if `is_alternation_literal` is
|
||||
// true, then we can make several assumptions about the structure of our
|
||||
// HIR. This is what justifies the `unreachable!` statements below.
|
||||
|
||||
if !expr.properties().is_alternation_literal() {
|
||||
return None;
|
||||
}
|
||||
let alts = match *expr.kind() {
|
||||
HirKind::Alternation(ref alts) => alts,
|
||||
_ => return None, // one literal isn't worth it
|
||||
};
|
||||
|
||||
let mut lits = vec![];
|
||||
for alt in alts {
|
||||
let mut lit = vec![];
|
||||
match *alt.kind() {
|
||||
HirKind::Empty => {}
|
||||
HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
|
||||
HirKind::Concat(ref exprs) => {
|
||||
for e in exprs {
|
||||
match *e.kind() {
|
||||
HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
|
||||
_ => unreachable!("expected literal, got {:?}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => unreachable!("expected literal or concat, got {:?}", alt),
|
||||
}
|
||||
lits.push(lit);
|
||||
}
|
||||
Some(lits)
|
||||
}
|
@ -20,11 +20,11 @@ use crate::error::{Error, ErrorKind};
|
||||
///
|
||||
/// Note that as of regex 1.9, this routine could theoretically be implemented
|
||||
/// without returning an error. Namely, for example, we could turn
|
||||
/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminators with a
|
||||
/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminator with a
|
||||
/// sub-expression that can never match anything. Thus, ripgrep would accept
|
||||
/// such regexes and just silently not match anything. Regex versions prior to 1.8
|
||||
/// don't support such constructs. I ended up deciding to leave the existing
|
||||
/// behavior of returning an error instead. For example:
|
||||
/// such regexes and just silently not match anything. Regex versions prior
|
||||
/// to 1.8 don't support such constructs. I ended up deciding to leave the
|
||||
/// existing behavior of returning an error instead. For example:
|
||||
///
|
||||
/// ```text
|
||||
/// $ echo -n 'foo\nbar\n' | rg 'foo\nbar'
|
||||
|
Loading…
Reference in New Issue
Block a user