Mirror of https://github.com/BurntSushi/ripgrep.git (synced 2025-03-17 20:28:03 +02:00)
Commit: 9626f16757
Parent: f7ff34fdf9
Commit message: progress

Changed: Cargo.lock (generated), 24 lines
@@ -4,9 +4,9 @@ version = 3

[[package]]
name = "aho-corasick"
version = "1.1.1"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
 "memchr",
]

@@ -31,9 +31,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

[[package]]
name = "bstr"
version = "1.6.2"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
checksum = "c79ad7fb2dd38f3dabd76b09c6a5a20c038fc0213ef1e9afd30eb777f120f019"
dependencies = [
 "memchr",
 "regex-automata",

@@ -305,9 +305,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"

[[package]]
name = "memchr"
version = "2.6.3"
version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"

[[package]]
name = "memmap2"

@@ -395,9 +395,7 @@ dependencies = [

[[package]]
name = "regex"
version = "1.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff"
version = "1.10.0"
dependencies = [
 "aho-corasick",
 "memchr",

@@ -407,9 +405,7 @@ dependencies = [

[[package]]
name = "regex-automata"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
version = "0.4.1"
dependencies = [
 "aho-corasick",
 "memchr",

@@ -418,9 +414,7 @@ dependencies = [

[[package]]
name = "regex-syntax"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
version = "0.8.0"

[[package]]
name = "ripgrep"

@@ -48,6 +48,11 @@ members = [
  "crates/ignore",
]

[patch.crates-io]
regex = { path = "/home/andrew/rust/regex" }
regex-automata = { path = "/home/andrew/rust/regex/regex-automata" }
regex-syntax = { path = "/home/andrew/rust/regex/regex-syntax" }

[dependencies]
bstr = "1.6.0"
grep = { version = "0.2.12", path = "crates/grep" }

@@ -26,12 +26,12 @@ log = { version = "0.4.20", optional = true }
serde = { version = "1.0.188", optional = true }

[dependencies.regex-syntax]
version = "0.7.5"
version = "0.8.0"
default-features = false
features = ["std"]

[dependencies.regex-automata]
version = "0.3.8"
version = "0.4.0"
default-features = false
features = ["std", "perf", "syntax", "meta", "nfa", "hybrid"]

@@ -27,7 +27,7 @@ same-file = "1.0.6"
walkdir = "2.4.0"

[dependencies.regex-automata]
version = "0.3.8"
version = "0.4.0"
default-features = false
features = ["std", "perf", "syntax", "meta", "nfa", "hybrid", "dfa-onepass"]

@@ -17,5 +17,5 @@ edition = "2021"
bstr = "1.6.2"
grep-matcher = { version = "0.1.6", path = "../matcher" }
log = "0.4.20"
regex-automata = { version = "0.3.8" }
regex-syntax = "0.7.5"
regex-automata = { version = "0.4.0" }
regex-syntax = "0.8.0"
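The dependency bumps above move the crates from regex-automata 0.3 / regex-syntax 0.7 to the 0.4 / 0.8 series (regex 1.10), the release line that introduced the half word boundary assertions used later in this commit. The snippet below is not part of the commit; it is a minimal sketch of that new syntax using the regex crate (assuming 1.10 or newer), mirroring the `find(r"foo!", "!foo!")` case from the word matcher tests further down.

```rust
use regex::Regex;

fn main() {
    // `\b{start-half}` and `\b{end-half}` only check the side of the
    // boundary facing away from the pattern, which is what -w/--word-regexp
    // needs when the pattern itself starts or ends with a non-word byte.
    let re = Regex::new(r"\b{start-half}foo!\b{end-half}").unwrap();
    let m = re.find("!foo!").unwrap();
    assert_eq!((1, 5), (m.start(), m.end()));
}
```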
@@ -62,12 +62,12 @@ impl AstAnalysis {
            Ast::Flags(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => {}
            | Ast::ClassUnicode(_)
            | Ast::ClassPerl(_) => {}
            Ast::Literal(ref x) => {
                self.from_ast_literal(x);
            }
            Ast::Class(ast::Class::Bracketed(ref x)) => {
            Ast::ClassBracketed(ref x) => {
                self.from_ast_class_set(&x.kind);
            }
            Ast::Repetition(ref x) => {
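For context (not part of the diff): the hunk above reflects regex-syntax 0.8 flattening the old `Ast::Class(ast::Class::...)` wrappers into top-level `Ast::ClassUnicode`, `Ast::ClassPerl`, and `Ast::ClassBracketed` variants. A tiny illustrative sketch of what callers now match on, under that assumption:

```rust
use regex_syntax::ast::{parse::Parser, Ast};

fn main() {
    // With regex-syntax 0.8, `\w` parses to the top-level ClassPerl variant
    // rather than to Ast::Class(ast::Class::Perl(..)).
    let ast = Parser::new().parse(r"\w").unwrap();
    assert!(matches!(ast, Ast::ClassPerl(_)));
}
```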
@@ -3,7 +3,7 @@ use {
    regex_automata::meta::Regex,
    regex_syntax::{
        ast,
        hir::{self, Hir, HirKind},
        hir::{self, Hir},
    },
};

@@ -296,35 +296,6 @@ impl ConfiguredHIR {
        }
    }

    /// Turns this configured HIR into one that only matches when both sides of
    /// the match correspond to a word boundary.
    ///
    /// Note that the HIR returned is like turning `pat` into
    /// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group
    /// `1` and not `0`.
    pub(crate) fn into_word(self) -> Result<ConfiguredHIR, Error> {
        // In theory building the HIR for \W should never fail, but there are
        // likely some pathological cases (particularly with respect to certain
        // values of limits) where it could in theory fail.
        let non_word = {
            let mut config = self.config.clone();
            config.fixed_strings = false;
            ConfiguredHIR::new(config, &[r"\W"])?
        };
        let line_anchor_start = Hir::look(self.line_anchor_start());
        let line_anchor_end = Hir::look(self.line_anchor_end());
        let hir = Hir::concat(vec![
            Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]),
            Hir::capture(hir::Capture {
                index: 1,
                name: None,
                sub: Box::new(renumber_capture_indices(self.hir)?),
            }),
            Hir::alternation(vec![non_word.hir, line_anchor_end]),
        ]);
        Ok(ConfiguredHIR { config: self.config, hir })
    }

    /// Turns this configured HIR into an equivalent one, but where it must
    /// match at the start and end of a line.
    pub(crate) fn into_whole_line(self) -> ConfiguredHIR {

@@ -336,12 +307,20 @@ impl ConfiguredHIR {
    }

    /// Turns this configured HIR into an equivalent one, but where it must
    /// match at the start and end of the haystack.
    pub(crate) fn into_anchored(self) -> ConfiguredHIR {
    /// match at word boundaries.
    pub(crate) fn into_word(self) -> ConfiguredHIR {
        let hir = Hir::concat(vec![
            Hir::look(hir::Look::Start),
            Hir::look(if self.config.unicode {
                hir::Look::WordStartHalfUnicode
            } else {
                hir::Look::WordStartHalfAscii
            }),
            self.hir,
            Hir::look(hir::Look::End),
            Hir::look(if self.config.unicode {
                hir::Look::WordEndHalfUnicode
            } else {
                hir::Look::WordEndHalfAscii
            }),
        ]);
        ConfiguredHIR { config: self.config, hir }
    }
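The replacement `into_word` above no longer wraps the pattern in `(?m:^|\W)(pat)(?m:$|\W)` with a shifted capture group; it simply brackets the existing HIR with half word boundary look-around assertions. Below is a minimal standalone sketch of the same composition, illustrative only: it uses a hypothetical `wordify` helper, the Unicode look variants, and regex-syntax's public parser rather than ripgrep's `ConfiguredHIR`.

```rust
use regex_syntax::{
    hir::{Hir, Look},
    ParserBuilder,
};

// Hypothetical helper: not ripgrep's API, just the shape of the new
// `into_word` without the ASCII/Unicode switch or error handling.
fn wordify(pattern: &str) -> Hir {
    let hir = ParserBuilder::new().build().parse(pattern).unwrap();
    Hir::concat(vec![
        Hir::look(Look::WordStartHalfUnicode),
        hir,
        Hir::look(Look::WordEndHalfUnicode),
    ])
}

fn main() {
    // The composed HIR prints as an equivalent pattern with the new
    // assertions around it.
    println!("{}", wordify("foo!"));
}
```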
@@ -365,50 +344,6 @@ impl ConfiguredHIR {
    }
}

/// This increments the index of every capture group in the given hir by 1. If
/// any increment results in an overflow, then an error is returned.
fn renumber_capture_indices(hir: Hir) -> Result<Hir, Error> {
    Ok(match hir.into_kind() {
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit),
        HirKind::Class(cls) => Hir::class(cls),
        HirKind::Look(x) => Hir::look(x),
        HirKind::Repetition(mut x) => {
            x.sub = Box::new(renumber_capture_indices(*x.sub)?);
            Hir::repetition(x)
        }
        HirKind::Capture(mut cap) => {
            cap.index = match cap.index.checked_add(1) {
                Some(index) => index,
                None => {
                    // This error message kind of sucks, but it's probably
                    // impossible for it to happen. The only way a capture
                    // index can overflow addition is if the regex is huge
                    // (or something else has gone horribly wrong).
                    let msg = "could not renumber capture index, too big";
                    return Err(Error::any(msg));
                }
            };
            cap.sub = Box::new(renumber_capture_indices(*cap.sub)?);
            Hir::capture(cap)
        }
        HirKind::Concat(subs) => {
            let subs = subs
                .into_iter()
                .map(|sub| renumber_capture_indices(sub))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::concat(subs)
        }
        HirKind::Alternation(subs) => {
            let subs = subs
                .into_iter()
                .map(|sub| renumber_capture_indices(sub))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::alternation(subs)
        }
    })
}

/// Returns true if the given literal string contains any byte from the line
/// terminator given.
fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool {
@@ -30,10 +30,6 @@ impl Error {
        Error { kind: ErrorKind::Regex(err.to_string()) }
    }

    pub(crate) fn any<E: ToString>(msg: E) -> Error {
        Error { kind: ErrorKind::Regex(msg.to_string()) }
    }

    /// Return the kind of this error.
    pub fn kind(&self) -> &ErrorKind {
        &self.kind

@@ -15,4 +15,3 @@ mod literal;
mod matcher;
mod non_matching;
mod strip;
mod word;
@@ -1,5 +1,3 @@
use std::sync::Arc;

use {
    grep_matcher::{
        ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,

@@ -11,12 +9,7 @@ use {
    },
};

use crate::{
    config::{Config, ConfiguredHIR},
    error::Error,
    literal::InnerLiterals,
    word::WordMatcher,
};
use crate::{config::Config, error::Error, literal::InnerLiterals};

/// A builder for constructing a `Matcher` using regular expressions.
///

@@ -61,9 +54,15 @@ impl RegexMatcherBuilder {
        &self,
        patterns: &[P],
    ) -> Result<RegexMatcher, Error> {
        let chir = self.config.build_many(patterns)?;
        let matcher = RegexMatcherImpl::new(chir)?;
        let (chir, re) = (matcher.chir(), matcher.regex());
        let mut chir = self.config.build_many(patterns)?;
        // 'whole_line' is a strict subset of 'word', so when it is enabled,
        // we don't need to bother with anything specific to word matching.
        if chir.config().whole_line {
            chir = chir.into_whole_line();
        } else if chir.config().word {
            chir = chir.into_word();
        }
        let regex = chir.to_regex()?;
        log::trace!("final regex: {:?}", chir.hir().to_string());

        let non_matching_bytes = chir.non_matching_bytes();
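The "strict subset" claim in the comment above can be checked directly with the regex crate (again assuming the 1.10 `\b{..-half}` syntax; this is an illustration, not code from the commit): once a pattern is anchored to a whole line, adding the half word boundary assertions never changes whether it matches.

```rust
use regex::Regex;

fn main() {
    // Whole-line matching already implies word matching: a match that runs
    // from line start to line end can never be adjacent to a word character,
    // so the extra half word boundary assertions are always satisfied.
    let whole_line = Regex::new(r"(?m:^)foo(?m:$)").unwrap();
    let whole_line_word =
        Regex::new(r"\b{start-half}(?m:^)foo(?m:$)\b{end-half}").unwrap();
    for hay in ["foo", "foo\nbar", "xfoo", "foox", "say foo"] {
        assert_eq!(whole_line.is_match(hay), whole_line_word.is_match(hay));
    }
}
```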
@@ -76,18 +75,13 @@ impl RegexMatcherBuilder {
        // then run the original regex on only that line. (In this case, the
        // regex engine is likely to handle this case for us since it's so
        // simple, but the idea applies.)
        let fast_line_regex = InnerLiterals::new(chir, re).one_regex()?;
        let fast_line_regex = InnerLiterals::new(&chir, &regex).one_regex()?;

        // We override the line terminator in case the configured HIR doesn't
        // support it.
        let mut config = self.config.clone();
        config.line_terminator = chir.line_terminator();
        Ok(RegexMatcher {
            config,
            matcher,
            fast_line_regex,
            non_matching_bytes,
        })
        Ok(RegexMatcher { config, regex, fast_line_regex, non_matching_bytes })
    }

    /// Build a new matcher from a plain alternation of literals.
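Not from the commit: a rough sketch of the "fast line regex" idea that the comments above describe, with made-up names. The inner-literal regex may produce false positives but never false negatives, so it can cheaply reject most lines before the real pattern runs.

```rust
use regex::Regex;

/// Illustrative only: `fast` must match everywhere `full` matches.
fn matching_lines<'h>(
    full: &Regex,
    fast: &Regex,
    haystack: &'h str,
) -> Vec<&'h str> {
    haystack
        .lines()
        // Cheap candidate check first, exact (slower) confirmation second.
        .filter(|line| fast.is_match(line) && full.is_match(line))
        .collect()
}

fn main() {
    let full = Regex::new(r"foo\w+bar").unwrap();
    let fast = Regex::new(r"foo").unwrap(); // an inner literal of `full`
    let lines = matching_lines(&full, &fast, "fooxbar\nnothing here\nfoo bar\n");
    assert_eq!(vec!["fooxbar"], lines);
}
```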
@@ -357,8 +351,9 @@ impl RegexMatcherBuilder {
pub struct RegexMatcher {
    /// The configuration specified by the caller.
    config: Config,
    /// The underlying matcher implementation.
    matcher: RegexMatcherImpl,
    /// The regular expression compiled from the pattern provided by the
    /// caller.
    regex: Regex,
    /// A regex that never reports false negatives but may report false
    /// positives that is believed to be capable of being matched more quickly
    /// than `regex`. Typically, this is a single literal or an alternation

@@ -392,53 +387,6 @@ impl RegexMatcher {
    }
}

/// An encapsulation of the type of matcher we use in `RegexMatcher`.
#[derive(Clone, Debug)]
enum RegexMatcherImpl {
    /// The standard matcher used for all regular expressions.
    Standard(StandardMatcher),
    /// A matcher that only matches at word boundaries. This transforms the
    /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
    /// Because of this, the WordMatcher provides its own implementation of
    /// `Matcher` to encapsulate its use of capture groups to make them
    /// invisible to the caller.
    Word(WordMatcher),
}

impl RegexMatcherImpl {
    /// Based on the configuration, create a new implementation of the
    /// `Matcher` trait.
    fn new(mut chir: ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
        // When whole_line is set, we don't use a word matcher even if word
        // matching was requested. Why? Because `(?m:^)(pat)(?m:$)` implies
        // word matching.
        Ok(if chir.config().word && !chir.config().whole_line {
            RegexMatcherImpl::Word(WordMatcher::new(chir)?)
        } else {
            if chir.config().whole_line {
                chir = chir.into_whole_line();
            }
            RegexMatcherImpl::Standard(StandardMatcher::new(chir)?)
        })
    }

    /// Return the underlying regex object used.
    fn regex(&self) -> &Regex {
        match *self {
            RegexMatcherImpl::Word(ref x) => x.regex(),
            RegexMatcherImpl::Standard(ref x) => &x.regex,
        }
    }

    /// Return the underlying HIR of the regex used for searching.
    fn chir(&self) -> &ConfiguredHIR {
        match *self {
            RegexMatcherImpl::Word(ref x) => x.chir(),
            RegexMatcherImpl::Standard(ref x) => &x.chir,
        }
    }
}

// This implementation just dispatches on the internal matcher impl except
// for the line terminator optimization, which is possibly executed via
// `fast_line_regex`.
@@ -446,265 +394,7 @@ impl Matcher for RegexMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.find_at(haystack, at),
            Word(ref m) => m.find_at(haystack, at),
        }
    }

    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.new_captures(),
            Word(ref m) => m.new_captures(),
        }
    }

    fn capture_count(&self) -> usize {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.capture_count(),
            Word(ref m) => m.capture_count(),
        }
    }

    fn capture_index(&self, name: &str) -> Option<usize> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.capture_index(name),
            Word(ref m) => m.capture_index(name),
        }
    }

    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.find(haystack),
            Word(ref m) => m.find(haystack),
        }
    }

    fn find_iter<F>(&self, haystack: &[u8], matched: F) -> Result<(), NoError>
    where
        F: FnMut(Match) -> bool,
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.find_iter(haystack, matched),
            Word(ref m) => m.find_iter(haystack, matched),
        }
    }

    fn try_find_iter<F, E>(
        &self,
        haystack: &[u8],
        matched: F,
    ) -> Result<Result<(), E>, NoError>
    where
        F: FnMut(Match) -> Result<bool, E>,
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.try_find_iter(haystack, matched),
            Word(ref m) => m.try_find_iter(haystack, matched),
        }
    }

    fn captures(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.captures(haystack, caps),
            Word(ref m) => m.captures(haystack, caps),
        }
    }

    fn captures_iter<F>(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
        matched: F,
    ) -> Result<(), NoError>
    where
        F: FnMut(&RegexCaptures) -> bool,
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.captures_iter(haystack, caps, matched),
            Word(ref m) => m.captures_iter(haystack, caps, matched),
        }
    }

    fn try_captures_iter<F, E>(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
        matched: F,
    ) -> Result<Result<(), E>, NoError>
    where
        F: FnMut(&RegexCaptures) -> Result<bool, E>,
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
            Word(ref m) => m.try_captures_iter(haystack, caps, matched),
        }
    }

    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.captures_at(haystack, at, caps),
            Word(ref m) => m.captures_at(haystack, at, caps),
        }
    }

    fn replace<F>(
        &self,
        haystack: &[u8],
        dst: &mut Vec<u8>,
        append: F,
    ) -> Result<(), NoError>
    where
        F: FnMut(Match, &mut Vec<u8>) -> bool,
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.replace(haystack, dst, append),
            Word(ref m) => m.replace(haystack, dst, append),
        }
    }

    fn replace_with_captures<F>(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
        dst: &mut Vec<u8>,
        append: F,
    ) -> Result<(), NoError>
    where
        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => {
                m.replace_with_captures(haystack, caps, dst, append)
            }
            Word(ref m) => {
                m.replace_with_captures(haystack, caps, dst, append)
            }
        }
    }

    fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.is_match(haystack),
            Word(ref m) => m.is_match(haystack),
        }
    }

    fn is_match_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.is_match_at(haystack, at),
            Word(ref m) => m.is_match_at(haystack, at),
        }
    }

    fn shortest_match(
        &self,
        haystack: &[u8],
    ) -> Result<Option<usize>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.shortest_match(haystack),
            Word(ref m) => m.shortest_match(haystack),
        }
    }

    fn shortest_match_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<usize>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.shortest_match_at(haystack, at),
            Word(ref m) => m.shortest_match_at(haystack, at),
        }
    }

    fn non_matching_bytes(&self) -> Option<&ByteSet> {
        Some(&self.non_matching_bytes)
    }

    fn line_terminator(&self) -> Option<LineTerminator> {
        self.config.line_terminator
    }

    fn find_candidate_line(
        &self,
        haystack: &[u8],
    ) -> Result<Option<LineMatchKind>, NoError> {
        Ok(match self.fast_line_regex {
            Some(ref regex) => {
                let input = Input::new(haystack);
                regex
                    .search_half(&input)
                    .map(|hm| LineMatchKind::Candidate(hm.offset()))
            }
            None => {
                self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
            }
        })
    }
}

/// The implementation of the standard regex matcher.
#[derive(Clone, Debug)]
struct StandardMatcher {
    /// The regular expression compiled from the pattern provided by the
    /// caller.
    regex: Regex,
    /// The HIR that produced this regex.
    ///
    /// We put this in an `Arc` because by the time it gets here, it won't
    /// change. And because cloning and dropping an `Hir` is somewhat expensive
    /// due to its deep recursive representation.
    chir: Arc<ConfiguredHIR>,
}

impl StandardMatcher {
    fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> {
        let chir = Arc::new(chir);
        let regex = chir.to_regex()?;
        Ok(StandardMatcher { regex, chir })
    }
}

impl Matcher for StandardMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    #[inline]
    fn find_at(
        &self,
        haystack: &[u8],
@@ -714,18 +404,22 @@ impl Matcher for StandardMatcher {
        Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
    }

    #[inline]
    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::new(self.regex.create_captures()))
    }

    #[inline]
    fn capture_count(&self) -> usize {
        self.regex.captures_len()
    }

    #[inline]
    fn capture_index(&self, name: &str) -> Option<usize> {
        self.regex.group_info().to_index(PatternID::ZERO, name)
    }

    #[inline]
    fn try_find_iter<F, E>(
        &self,
        haystack: &[u8],

@@ -744,6 +438,7 @@ impl Matcher for StandardMatcher {
        Ok(Ok(()))
    }

    #[inline]
    fn captures_at(
        &self,
        haystack: &[u8],

@@ -756,6 +451,7 @@ impl Matcher for StandardMatcher {
        Ok(caps.is_match())
    }

    #[inline]
    fn shortest_match_at(
        &self,
        haystack: &[u8],

@@ -764,6 +460,34 @@ impl Matcher for StandardMatcher {
        let input = Input::new(haystack).span(at..haystack.len());
        Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
    }

    #[inline]
    fn non_matching_bytes(&self) -> Option<&ByteSet> {
        Some(&self.non_matching_bytes)
    }

    #[inline]
    fn line_terminator(&self) -> Option<LineTerminator> {
        self.config.line_terminator
    }

    #[inline]
    fn find_candidate_line(
        &self,
        haystack: &[u8],
    ) -> Result<Option<LineMatchKind>, NoError> {
        Ok(match self.fast_line_regex {
            Some(ref regex) => {
                let input = Input::new(haystack);
                regex
                    .search_half(&input)
                    .map(|hm| LineMatchKind::Candidate(hm.offset()))
            }
            None => {
                self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
            }
        })
    }
}

/// Represents the match offsets of each capturing group in a match.

@@ -784,46 +508,27 @@ impl Matcher for StandardMatcher {
pub struct RegexCaptures {
    /// Where the captures are stored.
    caps: AutomataCaptures,
    /// These captures behave as if the capturing groups begin at the given
    /// offset. When set to `0`, this has no effect and capture groups are
    /// indexed like normal.
    ///
    /// This is useful when building matchers that wrap arbitrary regular
    /// expressions. For example, `WordMatcher` takes an existing regex
    /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that
    /// the regex has been wrapped from the caller. In order to do this,
    /// the matcher and the capturing groups must behave as if `(re)` is
    /// the `0`th capture group.
    offset: usize,
}

impl Captures for RegexCaptures {
    #[inline]
    fn len(&self) -> usize {
        self.caps
            .group_info()
            .all_group_len()
            .checked_sub(self.offset)
            .unwrap()
        self.caps.group_info().all_group_len()
    }

    #[inline]
    fn get(&self, i: usize) -> Option<Match> {
        let actual = i.checked_add(self.offset).unwrap();
        self.caps.get_group(actual).map(|sp| Match::new(sp.start, sp.end))
        self.caps.get_group(i).map(|sp| Match::new(sp.start, sp.end))
    }
}

impl RegexCaptures {
    #[inline]
    pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures {
        RegexCaptures::with_offset(caps, 0)
    }

    pub(crate) fn with_offset(
        caps: AutomataCaptures,
        offset: usize,
    ) -> RegexCaptures {
        RegexCaptures { caps, offset }
        RegexCaptures { caps }
    }

    #[inline]
    pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures {
        &mut self.caps
    }
@@ -19,7 +19,14 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
    match *expr.kind() {
        HirKind::Empty
        | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
        | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
        | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate)
        | HirKind::Look(Look::WordStartAscii | Look::WordStartUnicode)
        | HirKind::Look(Look::WordEndAscii | Look::WordEndUnicode)
        | HirKind::Look(
            Look::WordStartHalfAscii | Look::WordStartHalfUnicode,
        )
        | HirKind::Look(Look::WordEndHalfAscii | Look::WordEndHalfUnicode) => {
        }
        HirKind::Look(Look::Start | Look::End) => {
            // FIXME: This is wrong, but not doing this leads to incorrect
            // results because of how anchored searches are implemented in
@@ -1,341 +0,0 @@
use std::{
    collections::HashMap,
    panic::{RefUnwindSafe, UnwindSafe},
    sync::Arc,
};

use {
    grep_matcher::{Match, Matcher, NoError},
    regex_automata::{
        meta::Regex, util::captures::Captures, util::pool::Pool, Input,
        PatternID,
    },
};

use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};

type PoolFn =
    Box<dyn Fn() -> Captures + Send + Sync + UnwindSafe + RefUnwindSafe>;

/// A matcher for implementing "word match" semantics.
#[derive(Debug)]
pub(crate) struct WordMatcher {
    /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
    regex: Regex,
    /// The HIR that produced the regex above. We don't keep the HIR for the
    /// `original` regex.
    ///
    /// We put this in an `Arc` because by the time it gets here, it won't
    /// change. And because cloning and dropping an `Hir` is somewhat expensive
    /// due to its deep recursive representation.
    chir: Arc<ConfiguredHIR>,
    /// The original regex supplied by the user, which we use in a fast path
    /// to try and detect matches before deferring to slower engines.
    original: Regex,
    /// A map from capture group name to capture group index.
    names: HashMap<String, usize>,
    /// A thread-safe pool of reusable buffers for finding the match offset of
    /// the inner group.
    caps: Arc<Pool<Captures, PoolFn>>,
}

impl Clone for WordMatcher {
    fn clone(&self) -> WordMatcher {
        // We implement Clone manually so that we get a fresh Pool such that it
        // can set its own thread owner. This permits each thread using `caps`
        // to hit the fast path.
        //
        // Note that cloning a regex is "cheap" since it uses reference
        // counting internally.
        let re = self.regex.clone();
        WordMatcher {
            regex: self.regex.clone(),
            chir: Arc::clone(&self.chir),
            original: self.original.clone(),
            names: self.names.clone(),
            caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
        }
    }
}

impl WordMatcher {
    /// Create a new matcher from the given pattern that only produces matches
    /// that are considered "words."
    ///
    /// The given options are used to construct the regular expression
    /// internally.
    pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> {
        let original = chir.clone().into_anchored().to_regex()?;
        let chir = Arc::new(chir.into_word()?);
        let regex = chir.to_regex()?;
        let caps = Arc::new(Pool::new({
            let regex = regex.clone();
            Box::new(move || regex.create_captures()) as PoolFn
        }));

        let mut names = HashMap::new();
        let it = regex.group_info().pattern_names(PatternID::ZERO);
        for (i, optional_name) in it.enumerate() {
            if let Some(name) = optional_name {
                names.insert(name.to_string(), i.checked_sub(1).unwrap());
            }
        }
        Ok(WordMatcher { regex, chir, original, names, caps })
    }

    /// Return the underlying regex used to match at word boundaries.
    ///
    /// The original regex is in the capture group at index 1.
    pub(crate) fn regex(&self) -> &Regex {
        &self.regex
    }

    /// Return the underlying HIR for the regex used to match at word
    /// boundaries.
    pub(crate) fn chir(&self) -> &ConfiguredHIR {
        &self.chir
    }

    /// Attempt to do a fast confirmation of a word match that covers a subset
    /// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
    /// when a match is found. Ok(None) is returned when there is definitively
    /// no match. Err(()) is returned when this routine could not detect
    /// whether there was a match or not.
    fn fast_find(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, ()> {
        // This is a bit hairy. The whole point here is to avoid running a
        // slower regex engine to extract capture groups. Remember, our word
        // regex looks like this:
        //
        //     (^|\W)(<original regex>)(\W|$)
        //
        // What we want are the match offsets of <original regex>. So in the
        // easy/common case, the original regex will be sandwiched between
        // two codepoints that are in the \W class. So our approach here is to
        // look for a match of the overall word regexp, strip the \W ends and
        // then check whether the original regex matches what's left. If so,
        // then we are guaranteed a correct match.
        //
        // This only works though if we know that the match is sandwiched
        // between two \W codepoints. This only occurs when neither ^ nor $
        // match. This in turn only occurs when the match is at either the
        // beginning or end of the haystack. In either of those cases, we
        // declare defeat and defer to the slower implementation.
        //
        // The reason why we cannot handle the ^/$ cases here is because we
        // can't assume anything about the original pattern. (Try commenting
        // out the checks for ^/$ below and run the tests to see examples.)
        //
        // NOTE(2023-07-31): After fixing #2574, this logic honestly still
        // doesn't seem correct. Regex composition is hard.
        let input = Input::new(haystack).span(at..haystack.len());
        let mut cand = match self.regex.find(input) {
            None => return Ok(None),
            Some(m) => Match::new(m.start(), m.end()),
        };
        if cand.start() == 0 || cand.end() == haystack.len() {
            return Err(());
        }
        // We decode the chars on either side of the match. If either char is
        // a word character, then that means the ^/$ matched and not \W. In
        // that case, we defer to the slower engine.
        let (ch, slen) = bstr::decode_utf8(&haystack[cand]);
        if ch.map_or(true, regex_syntax::is_word_character) {
            return Err(());
        }
        let (ch, elen) = bstr::decode_last_utf8(&haystack[cand]);
        if ch.map_or(true, regex_syntax::is_word_character) {
            return Err(());
        }
        let new_start = cand.start() + slen;
        let new_end = cand.end() - elen;
        // This occurs when the original regex can match the empty string. In
        // this case, just bail instead of trying to get it right here since
        // it's likely a pathological case.
        if new_start > new_end {
            return Err(());
        }
        cand = cand.with_start(new_start).with_end(new_end);
        if self.original.is_match(&haystack[cand]) {
            Ok(Some(cand))
        } else {
            Err(())
        }
    }
}

impl Matcher for WordMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        // To make this easy to get right, we extract captures here instead of
        // calling `find_at`. The actual match is at capture group `1` instead
        // of `0`. We *could* use `find_at` here and then trim the match after
        // the fact, but that's a bit harder to get right, and it's not clear
        // if it's worth it.
        //
        // OK, well, it turns out that it is worth it! But it is quite tricky.
        // See `fast_find` for details. Effectively, this lets us skip running
        // a slower regex engine to extract capture groups in the vast majority
        // of cases. However, the slower engine is I believe required for full
        // correctness.
        match self.fast_find(haystack, at) {
            Ok(Some(m)) => return Ok(Some(m)),
            Ok(None) => return Ok(None),
            Err(()) => {}
        }

        let input = Input::new(haystack).span(at..haystack.len());
        let mut caps = self.caps.get();
        self.regex.search_captures(&input, &mut caps);
        Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end)))
    }

    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::with_offset(self.regex.create_captures(), 1))
    }

    fn capture_count(&self) -> usize {
        self.regex.captures_len().checked_sub(1).unwrap()
    }

    fn capture_index(&self, name: &str) -> Option<usize> {
        self.names.get(name).map(|i| *i)
    }

    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        let input = Input::new(haystack).span(at..haystack.len());
        let caps = caps.captures_mut();
        self.regex.search_captures(&input, caps);
        Ok(caps.is_match())
    }

    // We specifically do not implement other methods like find_iter or
    // captures_iter. Namely, the iter methods are guaranteed to be correct
    // by virtue of implementing find_at and captures_at above.
}

#[cfg(test)]
mod tests {
    use super::WordMatcher;
    use crate::config::Config;
    use grep_matcher::{Captures, Match, Matcher};

    fn matcher(pattern: &str) -> WordMatcher {
        let chir = Config::default().build_many(&[pattern]).unwrap();
        WordMatcher::new(chir).unwrap()
    }

    fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        matcher(pattern)
            .find(haystack.as_bytes())
            .unwrap()
            .map(|m| (m.start(), m.end()))
    }

    fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        let m = matcher(pattern);
        let mut caps = m.new_captures().unwrap();
        if !m.captures(haystack.as_bytes(), &mut caps).unwrap() {
            None
        } else {
            caps.get(0).map(|m| (m.start(), m.end()))
        }
    }

    // Test that the standard `find` API reports offsets correctly.
    #[test]
    fn various_find() {
        assert_eq!(Some((0, 3)), find(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo("));
        assert_eq!(None, find(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
        assert_eq!(None, find(r"foo", "fooб"));

        assert_eq!(Some((0, 4)), find(r"foo5", "foo5"));
        assert_eq!(None, find(r"foo", "foo5"));

        assert_eq!(Some((1, 4)), find(r"foo", "!foo!"));
        assert_eq!(Some((1, 5)), find(r"foo!", "!foo!"));
        assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!"));

        assert_eq!(Some((0, 3)), find(r"foo", "foo\n"));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n"));
        assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n"));
        assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n"));

        assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!"));
        assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!"));
        assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a"));

        assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n"));
        assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!##"));
        assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!\n##"));
        assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##"));
        assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa"));
    }

    // See: https://github.com/BurntSushi/ripgrep/issues/389
    #[test]
    fn regression_dash() {
        assert_eq!(Some((0, 2)), find(r"-2", "-2"));
    }

    // Test that the captures API also reports offsets correctly, just as
    // find does. This exercises a different path in the code since captures
    // are handled differently.
    #[test]
    fn various_captures() {
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));
        assert_eq!(None, find_by_caps(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));
        assert_eq!(None, find_by_caps(r"foo", "fooб"));
        // assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб"));

        // See: https://github.com/BurntSushi/ripgrep/issues/389
        assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));
    }

    // Test that the capture reporting methods work as advertised.
    #[test]
    fn capture_indexing() {
        let m = matcher(r"(a)(?P<foo>b)(c)");
        assert_eq!(4, m.capture_count());
        assert_eq!(Some(2), m.capture_index("foo"));

        let mut caps = m.new_captures().unwrap();
        assert_eq!(4, caps.len());

        assert!(m.captures(b"abc", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(0, 3)));
        assert_eq!(caps.get(1), Some(Match::new(0, 1)));
        assert_eq!(caps.get(2), Some(Match::new(1, 2)));
        assert_eq!(caps.get(3), Some(Match::new(2, 3)));
        assert_eq!(caps.get(4), None);

        assert!(m.captures(b"#abc#", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(1, 4)));
        assert_eq!(caps.get(1), Some(Match::new(1, 2)));
        assert_eq!(caps.get(2), Some(Match::new(2, 3)));
        assert_eq!(caps.get(3), Some(Match::new(3, 4)));
        assert_eq!(caps.get(4), None);
    }
}
@@ -144,6 +144,18 @@ For the Doctor Watsons of this world, as opposed to the Sherlock
    eqnice!(expected, cmd.stdout());
});

rgtest!(word_period, |dir: Dir, mut cmd: TestCommand| {
    dir.create("haystack", "...");
    cmd.arg("-ow").arg(".").arg("haystack");

    let expected = "\
.
.
.
";
    eqnice!(expected, cmd.stdout());
});
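The new `word_period` test exercises a pattern made entirely of non-word characters, which is the case the half word boundary assertions fix. Outside of ripgrep's test harness, the same behavior can be reproduced with the regex crate (assuming 1.10+; not part of the commit):

```rust
use regex::Regex;

fn main() {
    // With half word boundaries, `-w` style matching of `.` against "..."
    // yields three matches, since every period is flanked by non-word
    // characters (or the edge of the haystack).
    let re = Regex::new(r"\b{start-half}\.\b{end-half}").unwrap();
    assert_eq!(3, re.find_iter("...").count());
}
```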

rgtest!(line, |dir: Dir, mut cmd: TestCommand| {
    dir.create("sherlock", SHERLOCK);
    cmd.args(&[

@@ -1046,17 +1046,10 @@ rgtest!(r1878, |dir: Dir, _: TestCommand| {

// See: https://github.com/BurntSushi/ripgrep/issues/1891
rgtest!(r1891, |dir: Dir, mut cmd: TestCommand| {
    // TODO: Sadly, PCRE2 has different behavior here. Not clear why. We should
    // look into this and see if there's a fix needed at the regex engine
    // level.
    if dir.is_pcre2() {
        return;
    }

    dir.create("test", "\n##\n");
    // N.B. We use -o here to force the issue to occur, which seems to only
    // happen when each match needs to be detected.
    eqnice!("1:\n2:\n2:\n", cmd.args(&["-won", "", "test"]).stdout());
    eqnice!("1:\n2:\n2:\n2:\n", cmd.args(&["-won", "", "test"]).stdout());
});

// See: https://github.com/BurntSushi/ripgrep/issues/2095