mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-02-09 14:14:56 +02:00
regex: fix matching bug when text anchors are used
It turns out that if there are text anchors (that is, \A or \z, or ^/$ when multi-line is disabled), then the "fast" line searching path isn't quite correct. Since searching without multi-line mode is exceptionally rare, we just look for the presence of text anchors and specifically disable the line terminator option in 'grep-regex'. This in turn inhibits the "fast" line searching path. Fixes #2260
This commit is contained in:
parent
b9f5835534
commit
8e57989cd2
@ -175,6 +175,36 @@ impl ConfiguredHIR {
|
||||
self.config.crlf && self.expr.is_line_anchored_end()
|
||||
}
|
||||
|
||||
/// Returns the line terminator configured on this expression.
|
||||
///
|
||||
/// When we have beginning/end anchors (NOT line anchors), the fast line
|
||||
/// searching path isn't quite correct. Or at least, doesn't match the
|
||||
/// slow path. Namely, the slow path strips line terminators while the
|
||||
/// fast path does not. Since '$' (when multi-line mode is disabled)
|
||||
/// doesn't match at line boundaries, the existence of a line terminator
|
||||
/// might cause it to not match when it otherwise would with the line
|
||||
/// terminator stripped.
|
||||
///
|
||||
/// Since searching with text anchors is exceptionally rare in the
|
||||
/// context of line oriented searching (multi-line mode is basically
|
||||
/// always enabled), we just disable this optimization when there are
|
||||
/// text anchors. We disable it by not returning a line terminator, since
|
||||
/// without a line terminator, the fast search path can't be executed.
|
||||
///
|
||||
/// See: https://github.com/BurntSushi/ripgrep/issues/2260
|
||||
pub fn line_terminator(&self) -> Option<LineTerminator> {
|
||||
if self.is_any_anchored() {
|
||||
None
|
||||
} else {
|
||||
self.config.line_terminator
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if the underlying HIR has any text anchors.
|
||||
fn is_any_anchored(&self) -> bool {
|
||||
self.expr.is_any_anchored_start() || self.expr.is_any_anchored_end()
|
||||
}
|
||||
|
||||
/// Builds a regular expression from this HIR expression.
|
||||
pub fn regex(&self) -> Result<Regex, Error> {
|
||||
self.pattern_to_regex(&self.expr.to_string())
|
||||
|
@ -52,8 +52,12 @@ impl RegexMatcherBuilder {
|
||||
|
||||
let matcher = RegexMatcherImpl::new(&chir)?;
|
||||
log::trace!("final regex: {:?}", matcher.regex());
|
||||
let mut config = self.config.clone();
|
||||
// We override the line terminator in case the configured expr doesn't
|
||||
// support it.
|
||||
config.line_terminator = chir.line_terminator();
|
||||
Ok(RegexMatcher {
|
||||
config: self.config.clone(),
|
||||
config,
|
||||
matcher,
|
||||
fast_line_regex,
|
||||
non_matching_bytes,
|
||||
|
@ -108,7 +108,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
|
||||
}
|
||||
|
||||
pub fn match_by_line(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
|
||||
if self.is_line_by_line_fast() {
|
||||
if dbg!(self.is_line_by_line_fast()) {
|
||||
self.match_by_line_fast(buf)
|
||||
} else {
|
||||
self.match_by_line_slow(buf)
|
||||
|
@ -1512,4 +1512,31 @@ and exhibited clearly, with a label attached.\
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/2260
|
||||
#[test]
|
||||
fn regression_2260() {
|
||||
use grep_regex::RegexMatcherBuilder;
|
||||
|
||||
use crate::SearcherBuilder;
|
||||
|
||||
let matcher = RegexMatcherBuilder::new()
|
||||
.line_terminator(Some(b'\n'))
|
||||
.build(r"^\w+$")
|
||||
.unwrap();
|
||||
let mut searcher = SearcherBuilder::new().line_number(true).build();
|
||||
|
||||
let mut matched = false;
|
||||
searcher
|
||||
.search_slice(
|
||||
&matcher,
|
||||
b"GATC\n",
|
||||
crate::sinks::UTF8(|_, _| {
|
||||
matched = true;
|
||||
Ok(true)
|
||||
}),
|
||||
)
|
||||
.unwrap();
|
||||
assert!(matched);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user