mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-13 21:26:27 +02:00
The top-level listing was just getting a bit too long for my taste. So put all of the code in one directory and shrink the large top-level mess to a small top-level mess. NOTE: This commit only contains renames. The subsequent commit will actually make ripgrep build again. We do it this way with the naive hope that this will make it easier for git history to track the renames. Sigh.
190 lines
6.1 KiB
Rust
190 lines
6.1 KiB
Rust
use std::collections::HashMap;
|
|
|
|
use grep_matcher::{Match, Matcher, NoError};
|
|
use regex::bytes::Regex;
|
|
use regex_syntax::hir::{self, Hir, HirKind};
|
|
|
|
use config::ConfiguredHIR;
|
|
use error::Error;
|
|
use matcher::RegexCaptures;
|
|
|
|
/// A matcher for implementing "word match" semantics.
|
|
#[derive(Clone, Debug)]
|
|
pub struct CRLFMatcher {
|
|
/// The regex.
|
|
regex: Regex,
|
|
/// A map from capture group name to capture group index.
|
|
names: HashMap<String, usize>,
|
|
}
|
|
|
|
impl CRLFMatcher {
|
|
/// Create a new matcher from the given pattern that strips `\r` from the
|
|
/// end of every match.
|
|
///
|
|
/// This panics if the given expression doesn't need its CRLF stripped.
|
|
pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
|
|
assert!(expr.needs_crlf_stripped());
|
|
|
|
let regex = expr.regex()?;
|
|
let mut names = HashMap::new();
|
|
for (i, optional_name) in regex.capture_names().enumerate() {
|
|
if let Some(name) = optional_name {
|
|
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
|
}
|
|
}
|
|
Ok(CRLFMatcher { regex, names })
|
|
}
|
|
|
|
/// Return the underlying regex used by this matcher.
|
|
pub fn regex(&self) -> &Regex {
|
|
&self.regex
|
|
}
|
|
}
|
|
|
|
impl Matcher for CRLFMatcher {
|
|
type Captures = RegexCaptures;
|
|
type Error = NoError;
|
|
|
|
fn find_at(
|
|
&self,
|
|
haystack: &[u8],
|
|
at: usize,
|
|
) -> Result<Option<Match>, NoError> {
|
|
let m = match self.regex.find_at(haystack, at) {
|
|
None => return Ok(None),
|
|
Some(m) => Match::new(m.start(), m.end()),
|
|
};
|
|
Ok(Some(adjust_match(haystack, m)))
|
|
}
|
|
|
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
|
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
|
}
|
|
|
|
fn capture_count(&self) -> usize {
|
|
self.regex.captures_len().checked_sub(1).unwrap()
|
|
}
|
|
|
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
|
self.names.get(name).map(|i| *i)
|
|
}
|
|
|
|
fn captures_at(
|
|
&self,
|
|
haystack: &[u8],
|
|
at: usize,
|
|
caps: &mut RegexCaptures,
|
|
) -> Result<bool, NoError> {
|
|
caps.strip_crlf(false);
|
|
let r =
|
|
self.regex.captures_read_at(caps.locations_mut(), haystack, at);
|
|
if !r.is_some() {
|
|
return Ok(false);
|
|
}
|
|
|
|
// If the end of our match includes a `\r`, then strip it from all
|
|
// capture groups ending at the same location.
|
|
let end = caps.locations().get(0).unwrap().1;
|
|
if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
|
|
caps.strip_crlf(true);
|
|
}
|
|
Ok(true)
|
|
}
|
|
|
|
// We specifically do not implement other methods like find_iter or
|
|
// captures_iter. Namely, the iter methods are guaranteed to be correct
|
|
// by virtue of implementing find_at and captures_at above.
|
|
}
|
|
|
|
/// If the given match ends with a `\r`, then return a new match that ends
|
|
/// immediately before the `\r`.
|
|
pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
|
|
if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {
|
|
m.with_end(m.end() - 1)
|
|
} else {
|
|
m
|
|
}
|
|
}
|
|
|
|
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
|
|
///
|
|
/// This does not preserve the exact semantics of the given expression,
|
|
/// however, it does have the useful property that anything that matched the
|
|
/// given expression will also match the returned expression. The difference is
|
|
/// that the returned expression can match possibly other things as well.
|
|
///
|
|
/// The principle reason why we do this is because the underlying regex engine
|
|
/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that
|
|
/// level, but we perform this kludge in the mean time.
|
|
///
|
|
/// Note that while the match preserving semantics are nice and neat, the
|
|
/// match position semantics are quite a bit messier. Namely, `$` only ever
|
|
/// matches the position between characters where as `\r??` can match a
|
|
/// character and change the offset. This is regretable, but works out pretty
|
|
/// nicely in most cases, especially when a match is limited to a single line.
|
|
pub fn crlfify(expr: Hir) -> Hir {
|
|
match expr.into_kind() {
|
|
HirKind::Anchor(hir::Anchor::EndLine) => {
|
|
let concat = Hir::concat(vec![
|
|
Hir::repetition(hir::Repetition {
|
|
kind: hir::RepetitionKind::ZeroOrOne,
|
|
greedy: false,
|
|
hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
|
|
}),
|
|
Hir::anchor(hir::Anchor::EndLine),
|
|
]);
|
|
Hir::group(hir::Group {
|
|
kind: hir::GroupKind::NonCapturing,
|
|
hir: Box::new(concat),
|
|
})
|
|
}
|
|
HirKind::Empty => Hir::empty(),
|
|
HirKind::Literal(x) => Hir::literal(x),
|
|
HirKind::Class(x) => Hir::class(x),
|
|
HirKind::Anchor(x) => Hir::anchor(x),
|
|
HirKind::WordBoundary(x) => Hir::word_boundary(x),
|
|
HirKind::Repetition(mut x) => {
|
|
x.hir = Box::new(crlfify(*x.hir));
|
|
Hir::repetition(x)
|
|
}
|
|
HirKind::Group(mut x) => {
|
|
x.hir = Box::new(crlfify(*x.hir));
|
|
Hir::group(x)
|
|
}
|
|
HirKind::Concat(xs) => {
|
|
Hir::concat(xs.into_iter().map(crlfify).collect())
|
|
}
|
|
HirKind::Alternation(xs) => {
|
|
Hir::alternation(xs.into_iter().map(crlfify).collect())
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::crlfify;
|
|
use regex_syntax::Parser;
|
|
|
|
fn roundtrip(pattern: &str) -> String {
|
|
let expr1 = Parser::new().parse(pattern).unwrap();
|
|
let expr2 = crlfify(expr1);
|
|
expr2.to_string()
|
|
}
|
|
|
|
#[test]
|
|
fn various() {
|
|
assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");
|
|
assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))");
|
|
assert_eq!(
|
|
roundtrip(r"(?m)(?:foo$|bar$)"),
|
|
"(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))"
|
|
);
|
|
assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a");
|
|
|
|
// Not a multiline `$`, so no crlfifying occurs.
|
|
assert_eq!(roundtrip(r"$"), "\\z");
|
|
// It's a literal, derp.
|
|
assert_eq!(roundtrip(r"\$"), "\\$");
|
|
}
|
|
}
|