mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-13 21:26:27 +02:00
457 lines
16 KiB
Rust
457 lines
16 KiB
Rust
use std::collections::HashMap;
|
|
|
|
use grep_matcher::{Captures, Match, Matcher};
|
|
use pcre2::bytes::{CaptureLocations, Regex, RegexBuilder};
|
|
|
|
use crate::error::Error;
|
|
|
|
/// A builder for configuring the compilation of a PCRE2 regex.
|
|
#[derive(Clone, Debug)]
|
|
pub struct RegexMatcherBuilder {
|
|
builder: RegexBuilder,
|
|
case_smart: bool,
|
|
word: bool,
|
|
}
|
|
|
|
impl RegexMatcherBuilder {
|
|
/// Create a new matcher builder with a default configuration.
|
|
pub fn new() -> RegexMatcherBuilder {
|
|
RegexMatcherBuilder {
|
|
builder: RegexBuilder::new(),
|
|
case_smart: false,
|
|
word: false,
|
|
}
|
|
}
|
|
|
|
/// Compile the given pattern into a PCRE matcher using the current
|
|
/// configuration.
|
|
///
|
|
/// If there was a problem compiling the pattern, then an error is
|
|
/// returned.
|
|
pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
|
|
let mut builder = self.builder.clone();
|
|
if self.case_smart && !has_uppercase_literal(pattern) {
|
|
builder.caseless(true);
|
|
}
|
|
let res = if self.word {
|
|
let pattern = format!(r"(?<!\w)(?:{})(?!\w)", pattern);
|
|
builder.build(&pattern)
|
|
} else {
|
|
builder.build(pattern)
|
|
};
|
|
res.map_err(Error::regex).map(|regex| {
|
|
let mut names = HashMap::new();
|
|
for (i, name) in regex.capture_names().iter().enumerate() {
|
|
if let Some(ref name) = *name {
|
|
names.insert(name.to_string(), i);
|
|
}
|
|
}
|
|
RegexMatcher { regex, names }
|
|
})
|
|
}
|
|
|
|
/// Enables case insensitive matching.
|
|
///
|
|
/// If the `utf` option is also set, then Unicode case folding is used
|
|
/// to determine case insensitivity. When the `utf` option is not set,
|
|
/// then only standard ASCII case insensitivity is considered.
|
|
///
|
|
/// This option corresponds to the `i` flag.
|
|
pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.caseless(yes);
|
|
self
|
|
}
|
|
|
|
/// Whether to enable "smart case" or not.
|
|
///
|
|
/// When smart case is enabled, the builder will automatically enable
|
|
/// case insensitive matching based on how the pattern is written. Namely,
|
|
/// case insensitive mode is enabled when both of the following things
|
|
/// are believed to be true:
|
|
///
|
|
/// 1. The pattern contains at least one literal character. For example,
|
|
/// `a\w` contains a literal (`a`) but `\w` does not.
|
|
/// 2. Of the literals in the pattern, none of them are considered to be
|
|
/// uppercase according to Unicode. For example, `foo\pL` has no
|
|
/// uppercase literals but `Foo\pL` does.
|
|
///
|
|
/// Note that the implementation of this is not perfect. Namely, `\p{Ll}`
|
|
/// will prevent case insensitive matching even though it is part of a meta
|
|
/// sequence. This bug will probably never be fixed.
|
|
pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.case_smart = yes;
|
|
self
|
|
}
|
|
|
|
/// Enables "dot all" matching.
|
|
///
|
|
/// When enabled, the `.` metacharacter in the pattern matches any
|
|
/// character, include `\n`. When disabled (the default), `.` will match
|
|
/// any character except for `\n`.
|
|
///
|
|
/// This option corresponds to the `s` flag.
|
|
pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.dotall(yes);
|
|
self
|
|
}
|
|
|
|
/// Enable "extended" mode in the pattern, where whitespace is ignored.
|
|
///
|
|
/// This option corresponds to the `x` flag.
|
|
pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.extended(yes);
|
|
self
|
|
}
|
|
|
|
/// Enable multiline matching mode.
|
|
///
|
|
/// When enabled, the `^` and `$` anchors will match both at the beginning
|
|
/// and end of a subject string, in addition to matching at the start of
|
|
/// a line and the end of a line. When disabled, the `^` and `$` anchors
|
|
/// will only match at the beginning and end of a subject string.
|
|
///
|
|
/// This option corresponds to the `m` flag.
|
|
pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.multi_line(yes);
|
|
self
|
|
}
|
|
|
|
/// Enable matching of CRLF as a line terminator.
|
|
///
|
|
/// When enabled, anchors such as `^` and `$` will match any of the
|
|
/// following as a line terminator: `\r`, `\n` or `\r\n`.
|
|
///
|
|
/// This is disabled by default, in which case, only `\n` is recognized as
|
|
/// a line terminator.
|
|
pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.crlf(yes);
|
|
self
|
|
}
|
|
|
|
/// Require that all matches occur on word boundaries.
|
|
///
|
|
/// Enabling this option is subtly different than putting `\b` assertions
|
|
/// on both sides of your pattern. In particular, a `\b` assertion requires
|
|
/// that one side of it match a word character while the other match a
|
|
/// non-word character. This option, in contrast, merely requires that
|
|
/// one side match a non-word character.
|
|
///
|
|
/// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
|
|
/// word character. However, `-2` with this `word` option enabled will
|
|
/// match the `-2` in `foo -2 bar`.
|
|
pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.word = yes;
|
|
self
|
|
}
|
|
|
|
/// Enable Unicode matching mode.
|
|
///
|
|
/// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
|
|
/// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
|
|
///
|
|
/// When set, this implies UTF matching mode. It is not possible to enable
|
|
/// Unicode matching mode without enabling UTF matching mode.
|
|
///
|
|
/// This is disabled by default.
|
|
pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.ucp(yes);
|
|
self
|
|
}
|
|
|
|
/// Enable UTF matching mode.
|
|
///
|
|
/// When enabled, characters are treated as sequences of code units that
|
|
/// make up a single codepoint instead of as single bytes. For example,
|
|
/// this will cause `.` to match any single UTF-8 encoded codepoint, where
|
|
/// as when this is disabled, `.` will any single byte (except for `\n` in
|
|
/// both cases, unless "dot all" mode is enabled).
|
|
///
|
|
/// Note that when UTF matching mode is enabled, every search performed
|
|
/// will do a UTF-8 validation check, which can impact performance. The
|
|
/// UTF-8 check can be disabled via the `disable_utf_check` option, but it
|
|
/// is undefined behavior to enable UTF matching mode and search invalid
|
|
/// UTF-8.
|
|
///
|
|
/// This is disabled by default.
|
|
pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.utf(yes);
|
|
self
|
|
}
|
|
|
|
/// When UTF matching mode is enabled, this will disable the UTF checking
|
|
/// that PCRE2 will normally perform automatically. If UTF matching mode
|
|
/// is not enabled, then this has no effect.
|
|
///
|
|
/// UTF checking is enabled by default when UTF matching mode is enabled.
|
|
/// If UTF matching mode is enabled and UTF checking is enabled, then PCRE2
|
|
/// will return an error if you attempt to search a subject string that is
|
|
/// not valid UTF-8.
|
|
///
|
|
/// # Safety
|
|
///
|
|
/// It is undefined behavior to disable the UTF check in UTF matching mode
|
|
/// and search a subject string that is not valid UTF-8. When the UTF check
|
|
/// is disabled, callers must guarantee that the subject string is valid
|
|
/// UTF-8.
|
|
pub unsafe fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder {
|
|
self.builder.disable_utf_check();
|
|
self
|
|
}
|
|
|
|
/// Enable PCRE2's JIT and return an error if it's not available.
|
|
///
|
|
/// This generally speeds up matching quite a bit. The downside is that it
|
|
/// can increase the time it takes to compile a pattern.
|
|
///
|
|
/// If the JIT isn't available or if JIT compilation returns an error, then
|
|
/// regex compilation will fail with the corresponding error.
|
|
///
|
|
/// This is disabled by default, and always overrides `jit_if_available`.
|
|
pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.jit(yes);
|
|
self
|
|
}
|
|
|
|
/// Enable PCRE2's JIT if it's available.
|
|
///
|
|
/// This generally speeds up matching quite a bit. The downside is that it
|
|
/// can increase the time it takes to compile a pattern.
|
|
///
|
|
/// If the JIT isn't available or if JIT compilation returns an error,
|
|
/// then a debug message with the error will be emitted and the regex will
|
|
/// otherwise silently fall back to non-JIT matching.
|
|
///
|
|
/// This is disabled by default, and always overrides `jit`.
|
|
pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
|
self.builder.jit_if_available(yes);
|
|
self
|
|
}
|
|
|
|
/// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is
|
|
/// not enabled, then this has no effect.
|
|
///
|
|
/// When `None` is given, no custom JIT stack will be created, and instead,
|
|
/// the default JIT stack is used. When the default is used, its maximum
|
|
/// size is 32 KB.
|
|
///
|
|
/// When this is set, then a new JIT stack will be created with the given
|
|
/// maximum size as its limit.
|
|
///
|
|
/// Increasing the stack size can be useful for larger regular expressions.
|
|
///
|
|
/// By default, this is set to `None`.
|
|
pub fn max_jit_stack_size(
|
|
&mut self,
|
|
bytes: Option<usize>,
|
|
) -> &mut RegexMatcherBuilder {
|
|
self.builder.max_jit_stack_size(bytes);
|
|
self
|
|
}
|
|
}
|
|
|
|
/// An implementation of the `Matcher` trait using PCRE2.
|
|
#[derive(Clone, Debug)]
|
|
pub struct RegexMatcher {
|
|
regex: Regex,
|
|
names: HashMap<String, usize>,
|
|
}
|
|
|
|
impl RegexMatcher {
|
|
/// Create a new matcher from the given pattern using the default
|
|
/// configuration.
|
|
pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
|
|
RegexMatcherBuilder::new().build(pattern)
|
|
}
|
|
}
|
|
|
|
impl Matcher for RegexMatcher {
|
|
type Captures = RegexCaptures;
|
|
type Error = Error;
|
|
|
|
fn find_at(
|
|
&self,
|
|
haystack: &[u8],
|
|
at: usize,
|
|
) -> Result<Option<Match>, Error> {
|
|
Ok(self
|
|
.regex
|
|
.find_at(haystack, at)
|
|
.map_err(Error::regex)?
|
|
.map(|m| Match::new(m.start(), m.end())))
|
|
}
|
|
|
|
fn new_captures(&self) -> Result<RegexCaptures, Error> {
|
|
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
|
}
|
|
|
|
fn capture_count(&self) -> usize {
|
|
self.regex.captures_len()
|
|
}
|
|
|
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
|
self.names.get(name).map(|i| *i)
|
|
}
|
|
|
|
fn try_find_iter<F, E>(
|
|
&self,
|
|
haystack: &[u8],
|
|
mut matched: F,
|
|
) -> Result<Result<(), E>, Error>
|
|
where
|
|
F: FnMut(Match) -> Result<bool, E>,
|
|
{
|
|
for result in self.regex.find_iter(haystack) {
|
|
let m = result.map_err(Error::regex)?;
|
|
match matched(Match::new(m.start(), m.end())) {
|
|
Ok(true) => continue,
|
|
Ok(false) => return Ok(Ok(())),
|
|
Err(err) => return Ok(Err(err)),
|
|
}
|
|
}
|
|
Ok(Ok(()))
|
|
}
|
|
|
|
fn captures_at(
|
|
&self,
|
|
haystack: &[u8],
|
|
at: usize,
|
|
caps: &mut RegexCaptures,
|
|
) -> Result<bool, Error> {
|
|
Ok(self
|
|
.regex
|
|
.captures_read_at(&mut caps.locs, haystack, at)
|
|
.map_err(Error::regex)?
|
|
.is_some())
|
|
}
|
|
}
|
|
|
|
/// Represents the match offsets of each capturing group in a match.
|
|
///
|
|
/// The first, or `0`th capture group, always corresponds to the entire match
|
|
/// and is guaranteed to be present when a match occurs. The next capture
|
|
/// group, at index `1`, corresponds to the first capturing group in the regex,
|
|
/// ordered by the position at which the left opening parenthesis occurs.
|
|
///
|
|
/// Note that not all capturing groups are guaranteed to be present in a match.
|
|
/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
|
|
/// or `bar` will ever be set in any given match.
|
|
///
|
|
/// In order to access a capture group by name, you'll need to first find the
|
|
/// index of the group using the corresponding matcher's `capture_index`
|
|
/// method, and then use that index with `RegexCaptures::get`.
|
|
#[derive(Clone, Debug)]
|
|
pub struct RegexCaptures {
|
|
/// Where the locations are stored.
|
|
locs: CaptureLocations,
|
|
}
|
|
|
|
impl Captures for RegexCaptures {
|
|
fn len(&self) -> usize {
|
|
self.locs.len()
|
|
}
|
|
|
|
fn get(&self, i: usize) -> Option<Match> {
|
|
self.locs.get(i).map(|(s, e)| Match::new(s, e))
|
|
}
|
|
}
|
|
|
|
impl RegexCaptures {
|
|
pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
|
|
RegexCaptures { locs }
|
|
}
|
|
}
|
|
|
|
/// Reports whether the pattern contains an uppercase character that should
/// cancel the effect of the smart-case option.
///
/// Ideally the AST would be inspected so that things like '\p{Ll}' and
/// '\p{Lu}' (which should be treated as explicitly cased) are handled
/// correctly, but PCRE doesn't expose enough details for that kind of
/// analysis. The 'good enough' solution used here is a semi-naïve scan of
/// the pattern that ignores the single character following each '\'. This
/// at least supports the most common cases, like 'foo\w' and 'foo\S', in an
/// intuitive manner.
fn has_uppercase_literal(pattern: &str) -> bool {
    // Set after a backslash: the next character belongs to an escape
    // sequence and must not be inspected.
    let mut escaped = false;
    for ch in pattern.chars() {
        if escaped {
            escaped = false;
            continue;
        }
        if ch == '\\' {
            escaped = true;
            continue;
        }
        if ch.is_uppercase() {
            return true;
        }
    }
    false
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use grep_matcher::{LineMatchKind, Matcher};

    // Word matching should accept `-2` surrounded by spaces, whereas wrapping
    // the regex in `\b` should not, which demonstrates how the two differ.
    #[test]
    fn word() {
        let mut with_word = RegexMatcherBuilder::new();
        with_word.word(true);
        let re = with_word.build(r"-2").unwrap();
        assert!(re.is_match(b"abc -2 foo").unwrap());

        let mut with_boundaries = RegexMatcherBuilder::new();
        with_boundaries.word(false);
        let re = with_boundaries.build(r"\b-2\b").unwrap();
        assert!(!re.is_match(b"abc -2 foo").unwrap());
    }

    // Enabling CRLF should let `$` match at the end of a `\r\n` line.
    #[test]
    fn line_terminator_crlf() {
        // Without CRLF mode, `$` matches before a plain `\n` terminator...
        let mut lf_only = RegexMatcherBuilder::new();
        lf_only.multi_line(true);
        let re = lf_only.build(r"abc$").unwrap();
        assert!(re.is_match(b"abc\n").unwrap());

        // ...but not before a `\r\n` terminator.
        let re = lf_only.build(r"abc$").unwrap();
        assert!(!re.is_match(b"abc\r\n").unwrap());

        // With CRLF mode enabled, the `\r\n` terminator is recognized.
        let mut crlf = RegexMatcherBuilder::new();
        crlf.multi_line(true).crlf(true);
        let re = crlf.build(r"abc$").unwrap();
        assert!(re.is_match(b"abc\r\n").unwrap());
    }

    // Smart case should match case insensitively only when the pattern has
    // no uppercase literal.
    #[test]
    fn case_smart() {
        let mut smart = RegexMatcherBuilder::new();
        smart.case_smart(true);

        let lowercase = smart.build(r"abc").unwrap();
        assert!(lowercase.is_match(b"ABC").unwrap());

        let mixed_case = smart.build(r"aBc").unwrap();
        assert!(!mixed_case.is_match(b"ABC").unwrap());
    }

    // Finding a candidate line should report a confirmed match here.
    #[test]
    fn candidate_lines() {
        fn is_confirmed(m: LineMatchKind) -> bool {
            matches!(m, LineMatchKind::Confirmed(_))
        }

        let re = RegexMatcherBuilder::new().build(r"\wfoo\s").unwrap();
        let kind = re.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_confirmed(kind));
    }
}
|