1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-02-09 14:14:56 +02:00

regex: fix matching bug when text anchors are used

It turns out that if there are text anchors (that is, \A or \z, or ^/$
when multi-line is disabled), then the "fast" line searching path isn't
quite correct. Since searching without multi-line mode is exceptionally
rare, we just look for the presence of text anchors and specifically
disable the line terminator option in 'grep-regex'. This in turn
inhibits the "fast" line searching path.

Fixes #2260
This commit is contained in:
Andrew Gallant 2022-07-15 09:53:39 -04:00
parent b9f5835534
commit 8e57989cd2
No known key found for this signature in database
GPG Key ID: B2E3A4923F8B0D44
4 changed files with 63 additions and 2 deletions

View File

@ -175,6 +175,36 @@ impl ConfiguredHIR {
self.config.crlf && self.expr.is_line_anchored_end()
}
/// Returns the line terminator configured on this expression.
///
/// When we have beginning/end anchors (NOT line anchors), the fast line
/// searching path isn't quite correct. Or at least, doesn't match the
/// slow path. Namely, the slow path strips line terminators while the
/// fast path does not. Since '$' (when multi-line mode is disabled)
/// doesn't match at line boundaries, the existence of a line terminator
/// might cause it to not match when it otherwise would with the line
/// terminator stripped.
///
/// Since searching with text anchors is exceptionally rare in the
/// context of line oriented searching (multi-line mode is basically
/// always enabled), we just disable this optimization when there are
/// text anchors. We disable it by not returning a line terminator, since
/// without a line terminator, the fast search path can't be executed.
///
/// See: https://github.com/BurntSushi/ripgrep/issues/2260
pub fn line_terminator(&self) -> Option<LineTerminator> {
if self.is_any_anchored() {
None
} else {
self.config.line_terminator
}
}
/// Returns true if and only if the underlying HIR has any text anchors.
fn is_any_anchored(&self) -> bool {
self.expr.is_any_anchored_start() || self.expr.is_any_anchored_end()
}
/// Builds a regular expression from this HIR expression.
pub fn regex(&self) -> Result<Regex, Error> {
self.pattern_to_regex(&self.expr.to_string())

View File

@ -52,8 +52,12 @@ impl RegexMatcherBuilder {
let matcher = RegexMatcherImpl::new(&chir)?;
log::trace!("final regex: {:?}", matcher.regex());
let mut config = self.config.clone();
// We override the line terminator in case the configured expr doesn't
// support it.
config.line_terminator = chir.line_terminator();
Ok(RegexMatcher {
config: self.config.clone(),
config,
matcher,
fast_line_regex,
non_matching_bytes,

View File

@ -108,7 +108,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
}
pub fn match_by_line(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
if self.is_line_by_line_fast() {
if dbg!(self.is_line_by_line_fast()) {
self.match_by_line_fast(buf)
} else {
self.match_by_line_slow(buf)

View File

@ -1512,4 +1512,31 @@ and exhibited clearly, with a label attached.\
)
.unwrap();
}
// See: https://github.com/BurntSushi/ripgrep/issues/2260
#[test]
fn regression_2260() {
use grep_regex::RegexMatcherBuilder;
use crate::SearcherBuilder;
let matcher = RegexMatcherBuilder::new()
.line_terminator(Some(b'\n'))
.build(r"^\w+$")
.unwrap();
let mut searcher = SearcherBuilder::new().line_number(true).build();
let mut matched = false;
searcher
.search_slice(
&matcher,
b"GATC\n",
crate::sinks::UTF8(|_, _| {
matched = true;
Ok(true)
}),
)
.unwrap();
assert!(matched);
}
}