ripgrep/grep-regex/src/matcher.rs

use std::collections::HashMap;

use grep_matcher::{
    Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError, ByteSet,
};
use regex::bytes::{CaptureLocations, Regex};

use config::{Config, ConfiguredHIR};
use crlf::CRLFMatcher;
use error::Error;
use word::WordMatcher;

/// A builder for constructing a `Matcher` using regular expressions.
///
/// This builder re-exports many of the same options found on the regex crate's
/// builder, in addition to a few other options such as smart case, word
/// matching and the ability to set a line terminator which may enable certain
/// types of optimizations.
///
/// The syntax supported is documented as part of the regex crate:
/// https://docs.rs/regex/*/regex/#syntax
#[derive(Clone, Debug)]
pub struct RegexMatcherBuilder {
    config: Config,
}

impl Default for RegexMatcherBuilder {
    fn default() -> RegexMatcherBuilder {
        RegexMatcherBuilder::new()
    }
}

impl RegexMatcherBuilder {
    /// Create a new builder for configuring a regex matcher.
    pub fn new() -> RegexMatcherBuilder {
        RegexMatcherBuilder {
            config: Config::default(),
        }
    }

    /// Build a new matcher using the current configuration for the provided
    /// pattern.
    ///
    /// The syntax supported is documented as part of the regex crate:
    /// https://docs.rs/regex/*/regex/#syntax
    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
        let chir = self.config.hir(pattern)?;
        let fast_line_regex = chir.fast_line_regex()?;
        let non_matching_bytes = chir.non_matching_bytes();
        if let Some(ref re) = fast_line_regex {
            trace!("extracted fast line regex: {:?}", re);
        }

        let matcher = RegexMatcherImpl::new(&chir)?;
        trace!("final regex: {:?}", matcher.regex());
        Ok(RegexMatcher {
            config: self.config.clone(),
            matcher: matcher,
            fast_line_regex: fast_line_regex,
            non_matching_bytes: non_matching_bytes,
        })
    }

    /// Set the value for the case insensitive (`i`) flag.
    ///
    /// When enabled, letters in the pattern will match both upper case and
    /// lower case variants.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.case_insensitive = yes;
        self
    }

    /// Whether to enable "smart case" or not.
    ///
    /// When smart case is enabled, the builder will automatically enable
    /// case insensitive matching based on how the pattern is written. Namely,
    /// case insensitive mode is enabled when both of the following things
    /// are true:
    ///
    /// 1. The pattern contains at least one literal character. For example,
    ///    `a\w` contains a literal (`a`) but `\w` does not.
    /// 2. Of the literals in the pattern, none of them are considered to be
    ///    uppercase according to Unicode. For example, `foo\pL` has no
    ///    uppercase literals but `Foo\pL` does.
    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.case_smart = yes;
        self
    }

    /// Set the value for the multi-line matching (`m`) flag.
    ///
    /// When enabled, `^` matches the beginning of lines and `$` matches the
    /// end of lines.
    ///
    /// By default, they match beginning/end of the input.
    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.multi_line = yes;
        self
    }

    /// Set the value for the any character (`s`) flag, where in `.` matches
    /// anything when `s` is set and matches anything except for new line when
    /// it is not set (the default).
    ///
    /// N.B. "matches anything" means "any byte" when Unicode is disabled and
    /// means "any valid UTF-8 encoding of any Unicode scalar value" when
    /// Unicode is enabled.
    pub fn dot_matches_new_line(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.dot_matches_new_line = yes;
        self
    }

    /// Set the value for the greedy swap (`U`) flag.
    ///
    /// When enabled, a pattern like `a*` is lazy (tries to find shortest
    /// match) and `a*?` is greedy (tries to find longest match).
    ///
    /// By default, `a*` is greedy and `a*?` is lazy.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.swap_greed = yes;
        self
    }

    /// Set the value for the ignore whitespace (`x`) flag.
    ///
    /// When enabled, whitespace such as new lines and spaces will be ignored
    /// between expressions of the pattern, and `#` can be used to start a
    /// comment until the next new line.
    pub fn ignore_whitespace(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.ignore_whitespace = yes;
        self
    }

    /// Set the value for the Unicode (`u`) flag.
    ///
    /// Enabled by default. When disabled, character classes such as `\w` only
    /// match ASCII word characters instead of all Unicode word characters.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.unicode = yes;
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it does
    /// make good error messages harder. That is, in PCRE based regex engines,
    /// syntax like `\0` invokes a backreference, which is explicitly
    /// unsupported in Rust's regex engine. However, many users expect it to
    /// be supported. Therefore, when octal support is disabled, the error
    /// message will explicitly mention that backreferences aren't supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.octal = yes;
        self
    }

    /// Set the approximate size limit of the compiled regular expression.
    ///
    /// This roughly corresponds to the number of bytes occupied by a single
    /// compiled program. If the program exceeds this number, then a
    /// compilation error is returned.
    pub fn size_limit(&mut self, bytes: usize) -> &mut RegexMatcherBuilder {
        self.config.size_limit = bytes;
        self
    }

    /// Set the approximate size of the cache used by the DFA.
    ///
    /// This roughly corresponds to the number of bytes that the DFA will
    /// use while searching.
    ///
    /// Note that this is a *per thread* limit. There is no way to set a global
    /// limit. In particular, if a regex is used from multiple threads
    /// simultaneously, then each thread may use up to the number of bytes
    /// specified here.
    pub fn dfa_size_limit(
        &mut self,
        bytes: usize,
    ) -> &mut RegexMatcherBuilder {
        self.config.dfa_size_limit = bytes;
        self
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// lenth of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation, which results in a nest
    /// depth of `1`. In general, a nest limit is not something that manifests
    /// in an obvious way in the concrete syntax, therefore, it should not be
    /// used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexMatcherBuilder {
        self.config.nest_limit = limit;
        self
    }

    /// Set an ASCII line terminator for the matcher.
    ///
    /// The purpose of setting a line terminator is to enable a certain class
    /// of optimizations that can make line oriented searching faster. Namely,
    /// when a line terminator is enabled, then the builder will guarantee that
    /// the resulting matcher will never be capable of producing a match that
    /// contains the line terminator. Because of this guarantee, users of the
    /// resulting matcher do not need to slowly execute a search line by line
    /// for line oriented search.
    ///
    /// If the aforementioned guarantee about not matching a line terminator
    /// cannot be made because of how the pattern was written, then the builder
    /// will return an error when attempting to construct the matcher. For
    /// example, the pattern `a\sb` will be transformed such that it can never
    /// match `a\nb` (when `\n` is the line terminator), but the pattern `a\nb`
    /// will result in an error since the `\n` cannot be easily removed without
    /// changing the fundamental intent of the pattern.
    ///
    /// If the given line terminator isn't an ASCII byte (`<=127`), then the
    /// builder will return an error when constructing the matcher.
    pub fn line_terminator(
        &mut self,
        line_term: Option<u8>,
    ) -> &mut RegexMatcherBuilder {
        self.config.line_terminator = line_term.map(LineTerminator::byte);
        self
    }

    /// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
    /// regex patterns.
    ///
    /// This method sets two distinct settings:
    ///
    /// 1. It causes the line terminator for the matcher to be `\r\n`. Namely,
    ///    this prevents the matcher from ever producing a match that contains
    ///    a `\r` or `\n`.
    /// 2. It translates all instances of `$` in the pattern to `(?:\r??$)`.
    ///    This works around the fact that the regex engine does not support
    ///    matching CRLF as a line terminator when using `$`.
    ///
    /// In particular, because of (2), the matches produced by the matcher may
    /// be slightly different than what one would expect given the pattern.
    /// This is the trade off made: in many cases, `$` will "just work" in the
    /// presence of `\r\n` line terminators, but matches may require some
    /// trimming to faithfully represent the intended match.
    ///
    /// Note that if you do not wish to set the line terminator but would still
    /// like `$` to match `\r\n` line terminators, then it is valid to call
    /// `crlf(true)` followed by `line_terminator(None)`. Ordering is
    /// important, since `crlf` and `line_terminator` override each other.
    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        if yes {
            self.config.line_terminator = Some(LineTerminator::crlf());
        } else {
            self.config.line_terminator = None;
        }
        self.config.crlf = yes;
        self
    }

    /// Require that all matches occur on word boundaries.
    ///
    /// Enabling this option is subtly different than putting `\b` assertions
    /// on both sides of your pattern. In particular, a `\b` assertion requires
    /// that one side of it match a word character while the other match a
    /// non-word character. This option, in contrast, merely requires that
    /// one side match a non-word character.
    ///
    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
    /// word character. However, `-2` with this `word` option enabled will
    /// match the `-2` in `foo -2 bar`.
    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.word = yes;
        self
    }
}

/// An implementation of the `Matcher` trait using Rust's standard regex
/// library.
#[derive(Clone, Debug)]
pub struct RegexMatcher {
    /// The configuration specified by the caller.
    config: Config,
    /// The underlying matcher implementation.
    matcher: RegexMatcherImpl,
    /// A regex that never reports false negatives but may report false
    /// positives that is believed to be capable of being matched more quickly
    /// than `regex`. Typically, this is a single literal or an alternation
    /// of literals.
    fast_line_regex: Option<Regex>,
    /// A set of bytes that will never appear in a match.
    non_matching_bytes: ByteSet,
}

impl RegexMatcher {
    /// Create a new matcher from the given pattern using the default
    /// configuration.
    pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
        RegexMatcherBuilder::new().build(pattern)
    }

    /// Create a new matcher from the given pattern using the default
    /// configuration, but matches lines terminated by `\n`.
    ///
    /// This is meant to be a convenience constructor for using a
    /// `RegexMatcherBuilder` and setting its
    /// [`line_terminator`](struct.RegexMatcherBuilder.html#method.line_terminator)
    /// to `\n`. The purpose of using this constructor is to permit special
    /// optimizations that help speed up line oriented search. These types of
    /// optimizations are only appropriate when matches span no more than one
    /// line. For this reason, this constructor will return an error if the
    /// given pattern contains a literal `\n`. Other uses of `\n` (such as in
    /// `\s`) are removed transparently.
    pub fn new_line_matcher(pattern: &str) -> Result<RegexMatcher, Error> {
        RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(pattern)
    }
}

/// An encapsulation of the type of matcher we use in `RegexMatcher`.
#[derive(Clone, Debug)]
enum RegexMatcherImpl {
    /// The standard matcher used for all regular expressions.
    Standard(StandardMatcher),
    /// A matcher that strips `\r` from the end of matches.
    ///
    /// This is only used when the CRLF hack is enabled and the regex is line
    /// anchored at the end.
    CRLF(CRLFMatcher),
    /// A matcher that only matches at word boundaries. This transforms the
    /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
    /// Because of this, the WordMatcher provides its own implementation of
    /// `Matcher` to encapsulate its use of capture groups to make them
    /// invisible to the caller.
    Word(WordMatcher),
}

impl RegexMatcherImpl {
    /// Based on the configuration, create a new implementation of the
    /// `Matcher` trait.
    fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
        if expr.config().word {
            Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
        } else if expr.needs_crlf_stripped() {
            Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
        } else {
            Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
        }
    }

    /// Return the underlying regex object used.
    fn regex(&self) -> &Regex {
        match *self {
            RegexMatcherImpl::Word(ref x) => x.regex(),
            RegexMatcherImpl::CRLF(ref x) => x.regex(),
            RegexMatcherImpl::Standard(ref x) => &x.regex,
        }
    }
}

// This implementation just dispatches on the internal matcher impl except
// for the line terminator optimization, which is possibly executed via
// `fast_line_regex`.
impl Matcher for RegexMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.find_at(haystack, at),
            CRLF(ref m) => m.find_at(haystack, at),
            Word(ref m) => m.find_at(haystack, at),
        }
    }

    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.new_captures(),
            CRLF(ref m) => m.new_captures(),
            Word(ref m) => m.new_captures(),
        }
    }

    fn capture_count(&self) -> usize {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.capture_count(),
            CRLF(ref m) => m.capture_count(),
            Word(ref m) => m.capture_count(),
        }
    }

    fn capture_index(&self, name: &str) -> Option<usize> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.capture_index(name),
            CRLF(ref m) => m.capture_index(name),
            Word(ref m) => m.capture_index(name),
        }
    }

    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.find(haystack),
            CRLF(ref m) => m.find(haystack),
            Word(ref m) => m.find(haystack),
        }
    }

    fn find_iter<F>(
        &self,
        haystack: &[u8],
        matched: F,
    ) -> Result<(), NoError>
    where F: FnMut(Match) -> bool
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.find_iter(haystack, matched),
            CRLF(ref m) => m.find_iter(haystack, matched),
            Word(ref m) => m.find_iter(haystack, matched),
        }
    }

    fn try_find_iter<F, E>(
        &self,
        haystack: &[u8],
        matched: F,
    ) -> Result<Result<(), E>, NoError>
    where F: FnMut(Match) -> Result<bool, E>
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.try_find_iter(haystack, matched),
            CRLF(ref m) => m.try_find_iter(haystack, matched),
            Word(ref m) => m.try_find_iter(haystack, matched),
        }
    }

    fn captures(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.captures(haystack, caps),
            CRLF(ref m) => m.captures(haystack, caps),
            Word(ref m) => m.captures(haystack, caps),
        }
    }

    fn captures_iter<F>(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
        matched: F,
    ) -> Result<(), NoError>
    where F: FnMut(&RegexCaptures) -> bool
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.captures_iter(haystack, caps, matched),
            CRLF(ref m) => m.captures_iter(haystack, caps, matched),
            Word(ref m) => m.captures_iter(haystack, caps, matched),
        }
    }

    fn try_captures_iter<F, E>(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
        matched: F,
    ) -> Result<Result<(), E>, NoError>
    where F: FnMut(&RegexCaptures) -> Result<bool, E>
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
            CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
            Word(ref m) => m.try_captures_iter(haystack, caps, matched),
        }
    }

    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.captures_at(haystack, at, caps),
            CRLF(ref m) => m.captures_at(haystack, at, caps),
            Word(ref m) => m.captures_at(haystack, at, caps),
        }
    }

    fn replace<F>(
        &self,
        haystack: &[u8],
        dst: &mut Vec<u8>,
        append: F,
    ) -> Result<(), NoError>
    where F: FnMut(Match, &mut Vec<u8>) -> bool
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.replace(haystack, dst, append),
            CRLF(ref m) => m.replace(haystack, dst, append),
            Word(ref m) => m.replace(haystack, dst, append),
        }
    }

    fn replace_with_captures<F>(
        &self,
        haystack: &[u8],
        caps: &mut RegexCaptures,
        dst: &mut Vec<u8>,
        append: F,
    ) -> Result<(), NoError>
    where F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool
    {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => {
                m.replace_with_captures(haystack, caps, dst, append)
            }
            CRLF(ref m) => {
                m.replace_with_captures(haystack, caps, dst, append)
            }
            Word(ref m) => {
                m.replace_with_captures(haystack, caps, dst, append)
            }
        }
    }

    fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.is_match(haystack),
            CRLF(ref m) => m.is_match(haystack),
            Word(ref m) => m.is_match(haystack),
        }
    }

    fn is_match_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.is_match_at(haystack, at),
            CRLF(ref m) => m.is_match_at(haystack, at),
            Word(ref m) => m.is_match_at(haystack, at),
        }
    }

    fn shortest_match(
        &self,
        haystack: &[u8],
    ) -> Result<Option<usize>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.shortest_match(haystack),
            CRLF(ref m) => m.shortest_match(haystack),
            Word(ref m) => m.shortest_match(haystack),
        }
    }

    fn shortest_match_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<usize>, NoError> {
        use self::RegexMatcherImpl::*;
        match self.matcher {
            Standard(ref m) => m.shortest_match_at(haystack, at),
            CRLF(ref m) => m.shortest_match_at(haystack, at),
            Word(ref m) => m.shortest_match_at(haystack, at),
        }
    }

    fn non_matching_bytes(&self) -> Option<&ByteSet> {
        Some(&self.non_matching_bytes)
    }

    fn line_terminator(&self) -> Option<LineTerminator> {
        self.config.line_terminator
    }

    fn find_candidate_line(
        &self,
        haystack: &[u8],
    ) -> Result<Option<LineMatchKind>, NoError> {
        Ok(match self.fast_line_regex {
            Some(ref regex) => {
                regex.shortest_match(haystack).map(LineMatchKind::Candidate)
            }
            None => {
                self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
            }
        })
    }
}

/// The implementation of the standard regex matcher.
#[derive(Clone, Debug)]
struct StandardMatcher {
    /// The regular expression compiled from the pattern provided by the
    /// caller.
    regex: Regex,
    /// A map from capture group name to its corresponding index.
    names: HashMap<String, usize>,
}

impl StandardMatcher {
    fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> {
        let regex = expr.regex()?;
        let mut names = HashMap::new();
        for (i, optional_name) in regex.capture_names().enumerate() {
            if let Some(name) = optional_name {
                names.insert(name.to_string(), i);
            }
        }
        Ok(StandardMatcher { regex, names })
    }
}

impl Matcher for StandardMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        Ok(self.regex
            .find_at(haystack, at)
            .map(|m| Match::new(m.start(), m.end())))
    }

    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::new(self.regex.capture_locations()))
    }

    fn capture_count(&self) -> usize {
        self.regex.captures_len()
    }

    fn capture_index(&self, name: &str) -> Option<usize> {
        self.names.get(name).map(|i| *i)
    }

    fn try_find_iter<F, E>(
        &self,
        haystack: &[u8],
        mut matched: F,
    ) -> Result<Result<(), E>, NoError>
    where F: FnMut(Match) -> Result<bool, E>
    {
        for m in self.regex.find_iter(haystack) {
            match matched(Match::new(m.start(), m.end())) {
                Ok(true) => continue,
                Ok(false) => return Ok(Ok(())),
                Err(err) => return Ok(Err(err)),
            }
        }
        Ok(Ok(()))
    }

    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        Ok(self.regex.captures_read_at(&mut caps.locs, haystack, at).is_some())
    }

    fn shortest_match_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<usize>, NoError> {
        Ok(self.regex.shortest_match_at(haystack, at))
    }
}

/// Represents the match offsets of each capturing group in a match.
///
/// The first, or `0`th capture group, always corresponds to the entire match
/// and is guaranteed to be present when a match occurs. The next capture
/// group, at index `1`, corresponds to the first capturing group in the regex,
/// ordered by the position at which the left opening parenthesis occurs.
///
/// Note that not all capturing groups are guaranteed to be present in a match.
/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
/// or `bar` will ever be set in any given match.
///
/// In order to access a capture group by name, you'll need to first find the
/// index of the group using the corresponding matcher's `capture_index`
/// method, and then use that index with `RegexCaptures::get`.
#[derive(Clone, Debug)]
pub struct RegexCaptures {
    /// Where the locations are stored.
    locs: CaptureLocations,
    /// These captures behave as if the capturing groups begin at the given
    /// offset. When set to `0`, this has no affect and capture groups are
    /// indexed like normal.
    ///
    /// This is useful when building matchers that wrap arbitrary regular
    /// expressions. For example, `WordMatcher` takes an existing regex `re`
    /// and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that the regex
    /// has been wrapped from the caller. In order to do this, the matcher
    /// and the capturing groups must behave as if `(re)` is the `0`th capture
    /// group.
    offset: usize,
    /// When enable, the end of a match has `\r` stripped from it, if one
    /// exists.
    strip_crlf: bool,
}

impl Captures for RegexCaptures {
    fn len(&self) -> usize {
        self.locs.len().checked_sub(self.offset).unwrap()
    }

    fn get(&self, i: usize) -> Option<Match> {
        if !self.strip_crlf {
            let actual = i.checked_add(self.offset).unwrap();
            return self.locs.pos(actual).map(|(s, e)| Match::new(s, e));
        }

        // currently don't support capture offsetting with CRLF stripping
        assert_eq!(self.offset, 0);
        let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) {
            None => return None,
            Some(m) => m,
        };
        // If the end position of this match corresponds to the end position
        // of the overall match, then we apply our CRLF stripping. Otherwise,
        // we cannot assume stripping is correct.
        if i == 0 || m.end() == self.locs.pos(0).unwrap().1 {
            Some(m.with_end(m.end() - 1))
        } else {
            Some(m)
        }
    }
}

impl RegexCaptures {
    pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
        RegexCaptures::with_offset(locs, 0)
    }

    pub(crate) fn with_offset(
        locs: CaptureLocations,
        offset: usize,
    ) -> RegexCaptures {
        RegexCaptures { locs, offset, strip_crlf: false }
    }

    pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
        &mut self.locs
    }

    pub(crate) fn strip_crlf(&mut self, yes: bool) {
        self.strip_crlf = yes;
    }
}

#[cfg(test)]
mod tests {
    use grep_matcher::{LineMatchKind, Matcher};
    use super::*;

    // Test that enabling word matches does the right thing and demonstrate
    // the difference between it and surrounding the regex in `\b`.
    #[test]
    fn word() {
        let matcher = RegexMatcherBuilder::new()
            .word(true)
            .build(r"-2")
            .unwrap();
        assert!(matcher.is_match(b"abc -2 foo").unwrap());

        let matcher = RegexMatcherBuilder::new()
            .word(false)
            .build(r"\b-2\b")
            .unwrap();
        assert!(!matcher.is_match(b"abc -2 foo").unwrap());
    }

    // Test that enabling a line terminator prevents it from matching through
    // said line terminator.
    #[test]
    fn line_terminator() {
        // This works, because there's no line terminator specified.
        let matcher = RegexMatcherBuilder::new()
            .build(r"abc\sxyz")
            .unwrap();
        assert!(matcher.is_match(b"abc\nxyz").unwrap());

        // This doesn't.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"abc\sxyz")
            .unwrap();
        assert!(!matcher.is_match(b"abc\nxyz").unwrap());
    }

    // Ensure that the builder returns an error if a line terminator is set
    // and the regex could not be modified to remove a line terminator.
    #[test]
    fn line_terminator_error() {
        assert!(RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"a\nz")
            .is_err())
    }

    // Test that enabling CRLF permits `$` to match at the end of a line.
    #[test]
    fn line_terminator_crlf() {
        // Test normal use of `$` with a `\n` line terminator.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\n").unwrap());

        // Test that `$` doesn't match at `\r\n` boundary normally.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(!matcher.is_match(b"abc\r\n").unwrap());

        // Now check the CRLF handling.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .crlf(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\r\n").unwrap());
    }

    // Test that smart case works.
    #[test]
    fn case_smart() {
        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"abc")
            .unwrap();
        assert!(matcher.is_match(b"ABC").unwrap());

        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"aBc")
            .unwrap();
        assert!(!matcher.is_match(b"ABC").unwrap());
    }

    // Test that finding candidate lines works as expected.
    #[test]
    fn candidate_lines() {
        fn is_confirmed(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Confirmed(_) => true,
                _ => false,
            }
        }
        fn is_candidate(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Candidate(_) => true,
                _ => false,
            }
        }

        // With no line terminator set, we can't employ any optimizations,
        // so we get a confirmed match.
        let matcher = RegexMatcherBuilder::new()
            .build(r"\wfoo\s")
            .unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_confirmed(m));

        // With a line terminator and a regex specially crafted to have an
        // easy-to-detect inner literal, we can apply an optimization that
        // quickly finds candidate matches.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"\wfoo\s")
            .unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_candidate(m));
    }
}