libripgrep: initial commit introducing libripgrep

libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162
2025-08-04 21:52:54 +02:00 · 2018-04-29 09:29:52 -04:00
parent 0958837ee1
commit d9ca529356
68 changed files with 18010 additions and 20 deletions
--- a/grep-regex/Cargo.toml
+++ b/grep-regex/Cargo.toml
@ -0,0 +1,21 @@
+[package]
+name = "grep-regex"
+version = "0.0.1"  #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = """
+Use Rust's regex library with the 'grep' crate.
+"""
+documentation = "https://docs.rs/grep-regex"
+homepage = "https://github.com/BurntSushi/ripgrep"
+repository = "https://github.com/BurntSushi/ripgrep"
+readme = "README.md"
+keywords = ["regex", "grep", "search", "pattern", "line"]
+license = "Unlicense/MIT"
+
+[dependencies]
+log = "0.4"
+grep-matcher = { version = "0.0.1", path = "../grep-matcher" }
+regex = "1"
+regex-syntax = "0.6"
+thread_local = "0.3.5"
+utf8-ranges = "1"
--- a/grep-regex/LICENSE-MIT
+++ b/grep-regex/LICENSE-MIT
@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/grep-regex/README.md
+++ b/grep-regex/README.md
@ -0,0 +1,35 @@
+grep-regex
+----------
+The `grep-regex` crate provides an implementation of the `Matcher` trait from
+the `grep-matcher` crate. This implementation permits Rust's regex engine to
+be used in the `grep` crate for fast line oriented searching.
+
+[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep)
+[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep)
+[![](https://img.shields.io/crates/v/grep-regex.svg)](https://crates.io/crates/grep-regex)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+### Documentation
+
+[https://docs.rs/grep-regex](https://docs.rs/grep-regex)
+
+**NOTE:** You probably don't want to use this crate directly. Instead, you
+should prefer the facade defined in the
+[`grep`](https://docs.rs/grep)
+crate.
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+grep-regex = "0.1"
+```
+
+and this to your crate root:
+
+```rust
+extern crate grep_regex;
+```
--- a/grep-regex/UNLICENSE
+++ b/grep-regex/UNLICENSE
@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
--- a/grep-regex/src/ast.rs
+++ b/grep-regex/src/ast.rs
@ -0,0 +1,263 @@
+use regex_syntax::ast::{self, Ast};
+use regex_syntax::ast::parse::Parser;
+
+/// The results of analyzing the AST of a regular expression (e.g., to
+/// support smart case).
+#[derive(Clone, Debug)]
+pub struct AstAnalysis {
+    /// True if and only if a literal uppercase character occurs in the regex.
+    any_uppercase: bool,
+    /// True if and only if the regex contains any literal at all.
+    any_literal: bool,
+    /// True if and only if every literal seen is written verbatim and the
+    /// regex uses no other syntax (classes, groups, repetitions, ...).
+    all_verbatim_literal: bool,
+}
+
+impl AstAnalysis {
+    /// Parses `pattern` and analyzes the resulting AST.
+    ///
+    /// `None` is returned when `pattern` is not a valid regular expression.
+    #[allow(dead_code)]
+    pub fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
+        match Parser::new().parse(pattern) {
+            Ok(ast) => Some(AstAnalysis::from_ast(&ast)),
+            Err(_) => None,
+        }
+    }
+
+    /// Analyzes an already parsed AST.
+    pub fn from_ast(ast: &Ast) -> AstAnalysis {
+        let mut result = AstAnalysis::new();
+        result.from_ast_impl(ast);
+        result
+    }
+
+    /// Reports whether a literal uppercase character occurs in the pattern.
+    ///
+    /// Only literals count. A pattern like `\pL` contains no uppercase
+    /// literal, even though `L` is uppercase and the `\pL` class contains
+    /// uppercase characters.
+    pub fn any_uppercase(&self) -> bool {
+        self.any_uppercase
+    }
+
+    /// Reports whether the pattern contains any literal at all.
+    ///
+    /// `\pL` reports `false` while `\pLfoo` reports `true`.
+    pub fn any_literal(&self) -> bool {
+        self.any_literal
+    }
+
+    /// Reports whether the whole pattern is a verbatim literal with no
+    /// special meta characters.
+    ///
+    /// When this is true, the pattern satisfies the law
+    /// `escape(pattern) == pattern`. A notable case that returns `false`
+    /// is `a\u0061`, even though `\u0061` is just a literal `a`.
+    ///
+    /// This flag exists to decide whether a pattern can be handed as-is to
+    /// non-regex substring search algorithms.
+    #[allow(dead_code)]
+    pub fn all_verbatim_literal(&self) -> bool {
+        self.all_verbatim_literal
+    }
+
+    /// Builds the starting state: nothing has been seen yet, so the pattern
+    /// is still considered entirely verbatim.
+    fn new() -> AstAnalysis {
+        AstAnalysis {
+            all_verbatim_literal: true,
+            any_literal: false,
+            any_uppercase: false,
+        }
+    }
+
+    /// Walks `ast` recursively and updates the flags for every node visited.
+    fn from_ast_impl(&mut self, ast: &Ast) {
+        // Once the flags have settled there is nothing left to learn.
+        if self.done() {
+            return;
+        }
+        match *ast {
+            Ast::Empty(_) => {}
+            Ast::Literal(ref lit) => self.from_ast_literal(lit),
+            // A concatenation is the only compound node that keeps the
+            // verbatim flag intact; its children decide.
+            Ast::Concat(ref concat) => {
+                for sub in &concat.asts {
+                    self.from_ast_impl(sub);
+                }
+            }
+            Ast::Alternation(ref alternation) => {
+                self.all_verbatim_literal = false;
+                for sub in &alternation.asts {
+                    self.from_ast_impl(sub);
+                }
+            }
+            Ast::Group(ref group) => {
+                self.all_verbatim_literal = false;
+                self.from_ast_impl(&group.ast);
+            }
+            Ast::Repetition(ref repetition) => {
+                self.all_verbatim_literal = false;
+                self.from_ast_impl(&repetition.ast);
+            }
+            Ast::Class(ast::Class::Bracketed(ref class)) => {
+                self.all_verbatim_literal = false;
+                self.from_ast_class_set(&class.kind);
+            }
+            Ast::Flags(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::Class(ast::Class::Unicode(_))
+            | Ast::Class(ast::Class::Perl(_)) => {
+                self.all_verbatim_literal = false;
+            }
+        }
+    }
+
+    /// Visits the contents of a bracketed character class.
+    fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            ast::ClassSet::BinaryOp(ref op) => {
+                self.from_ast_class_set(&op.lhs);
+                self.from_ast_class_set(&op.rhs);
+            }
+            ast::ClassSet::Item(ref item) => {
+                self.from_ast_class_set_item(item);
+            }
+        }
+    }
+
+    /// Visits a single item of a bracketed character class.
+    fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            ast::ClassSetItem::Literal(ref lit) => self.from_ast_literal(lit),
+            ast::ClassSetItem::Range(ref range) => {
+                // Both end points of a range are literals in the pattern.
+                self.from_ast_literal(&range.start);
+                self.from_ast_literal(&range.end);
+            }
+            ast::ClassSetItem::Bracketed(ref nested) => {
+                self.from_ast_class_set(&nested.kind);
+            }
+            ast::ClassSetItem::Union(ref set_union) => {
+                for item in &set_union.items {
+                    self.from_ast_class_set_item(item);
+                }
+            }
+            // Named classes contribute no literals.
+            ast::ClassSetItem::Empty(_)
+            | ast::ClassSetItem::Ascii(_)
+            | ast::ClassSetItem::Unicode(_)
+            | ast::ClassSetItem::Perl(_) => {}
+        }
+    }
+
+    /// Records a single literal in the flags.
+    fn from_ast_literal(&mut self, ast: &ast::Literal) {
+        self.any_literal = true;
+        if ast.c.is_uppercase() {
+            self.any_uppercase = true;
+        }
+        if ast.kind != ast::LiteralKind::Verbatim {
+            self.all_verbatim_literal = false;
+        }
+    }
+
+    /// Returns true if and only if no further AST node could change any of
+    /// the flags.
+    fn done(&self) -> bool {
+        let saw_uppercase_literal = self.any_uppercase && self.any_literal;
+        saw_uppercase_literal && !self.all_verbatim_literal
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Analyzes `pattern` and checks each of the three flags against the
+    /// expected values.
+    fn check(pattern: &str, uppercase: bool, literal: bool, verbatim: bool) {
+        let got = AstAnalysis::from_pattern(pattern).unwrap();
+        assert_eq!(got.any_uppercase, uppercase);
+        assert_eq!(got.any_literal, literal);
+        assert_eq!(got.all_verbatim_literal, verbatim);
+    }
+
+    #[test]
+    fn various() {
+        // (pattern, any_uppercase, any_literal, all_verbatim_literal)
+        let cases = [
+            ("", false, false, true),
+            ("foo", false, true, true),
+            ("Foo", true, true, true),
+            ("foO", true, true, true),
+            (r"foo\\", false, true, false),
+            (r"foo\w", false, true, false),
+            (r"foo\S", false, true, false),
+            (r"foo\p{Ll}", false, true, false),
+            (r"foo[a-z]", false, true, false),
+            (r"foo[A-Z]", true, true, false),
+            (r"foo[\S\t]", false, true, false),
+            (r"foo\\S", true, true, false),
+            (r"\p{Ll}", false, false, false),
+            (r"aBc\w", true, true, false),
+            (r"a\u0061", false, true, false),
+        ];
+        for &(pattern, uppercase, literal, verbatim) in cases.iter() {
+            check(pattern, uppercase, literal, verbatim);
+        }
+    }
+}
--- a/grep-regex/src/config.rs
+++ b/grep-regex/src/config.rs
@ -0,0 +1,265 @@
+use grep_matcher::{ByteSet, LineTerminator};
+use regex::bytes::{Regex, RegexBuilder};
+use regex_syntax::ast::{self, Ast};
+use regex_syntax::hir::Hir;
+
+use ast::AstAnalysis;
+use crlf::crlfify;
+use error::Error;
+use literal::LiteralSets;
+use non_matching::non_matching_bytes;
+use strip::strip_from_match;
+
+/// Config represents the configuration of a regex matcher in this crate.
+/// The configuration is itself a rough combination of the knobs found in
+/// the `regex` crate itself, along with additional `grep-matcher` specific
+/// options.
+///
+/// The configuration can be used to build a "configured" HIR expression. A
+/// configured HIR expression is an HIR expression that is aware of the
+/// configuration which generated it, and provides transformation on that HIR
+/// such that the configuration is preserved.
+#[derive(Clone, Debug)]
+pub struct Config {
+    pub case_insensitive: bool, // when true, matching is always case insensitive
+    pub case_smart: bool, // smart case: see `is_case_insensitive`
+    pub multi_line: bool, // forwarded to the parser and regex builders
+    pub dot_matches_new_line: bool, // forwarded to the parser and regex builders
+    pub swap_greed: bool, // applied once in `hir`; deliberately not reapplied later
+    pub ignore_whitespace: bool, // forwarded to the AST and HIR parsers
+    pub unicode: bool, // forwarded to the parser and regex builders
+    pub octal: bool, // forwarded to the parser and regex builders
+    pub size_limit: usize, // forwarded to `RegexBuilder::size_limit`
+    pub dfa_size_limit: usize, // forwarded to `RegexBuilder::dfa_size_limit`
+    pub nest_limit: u32, // forwarded to the parser and regex builders
+    pub line_terminator: Option<LineTerminator>, // when set, `hir` strips it via `strip_from_match`
+    pub crlf: bool, // when true, `hir` rewrites the expression with `crlfify`
+    pub word: bool, // NOTE(review): not read in this file; presumably used by the word matcher
+}
+
+impl Default for Config {
+    fn default() -> Config {
+        // One mebibyte; the limits below are multiples of it.
+        let mib: usize = 1 << 20;
+        Config {
+            // Matching behavior.
+            case_insensitive: false,
+            case_smart: false,
+            word: false,
+            crlf: false,
+            line_terminator: None,
+            // Syntax flags.
+            multi_line: false,
+            dot_matches_new_line: false,
+            swap_greed: false,
+            ignore_whitespace: false,
+            unicode: true,
+            octal: false,
+            // Limits. The size limits are much bigger than what's in the
+            // regex crate.
+            size_limit: 100 * mib,
+            dfa_size_limit: 1000 * mib,
+            nest_limit: 250,
+        }
+    }
+}
+
+impl Config {
+    /// Parses `pattern` into an HIR expression and pairs it with a copy of
+    /// this configuration.
+    ///
+    /// An error is returned if the pattern fails to parse, or if
+    /// `strip_from_match` reports an error for the configured line
+    /// terminator.
+    pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
+        let analysis = self.analysis(pattern)?;
+        let case_insensitive = self.is_case_insensitive(&analysis)?;
+        let mut parser = ::regex_syntax::ParserBuilder::new()
+            .nest_limit(self.nest_limit)
+            .octal(self.octal)
+            .allow_invalid_utf8(true)
+            .ignore_whitespace(self.ignore_whitespace)
+            .case_insensitive(case_insensitive)
+            .multi_line(self.multi_line)
+            .dot_matches_new_line(self.dot_matches_new_line)
+            .swap_greed(self.swap_greed)
+            .unicode(self.unicode)
+            .build();
+        let parsed = parser.parse(pattern).map_err(Error::regex)?;
+        let stripped = if let Some(line_term) = self.line_terminator {
+            strip_from_match(parsed, line_term)?
+        } else {
+            parsed
+        };
+        // With CRLF mode enabled, every multi-line `$` is rewritten by
+        // `crlfify` to `(?:\r??$)`.
+        let expr = if self.crlf { crlfify(stripped) } else { stripped };
+        Ok(ConfiguredHIR {
+            original: pattern.to_string(),
+            config: self.clone(),
+            analysis,
+            expr,
+        })
+    }
+
+    /// Decides whether the pattern should be matched case insensitively,
+    /// taking the `case_smart` knob into account.
+    ///
+    /// `case_insensitive` always wins. Otherwise, smart case applies only
+    /// when it is enabled and the pattern has at least one literal and no
+    /// uppercase literal.
+    fn is_case_insensitive(
+        &self,
+        analysis: &AstAnalysis,
+    ) -> Result<bool, Error> {
+        let smart = self.case_smart
+            && analysis.any_literal()
+            && !analysis.any_uppercase();
+        Ok(self.case_insensitive || smart)
+    }
+
+    /// Runs the AST analysis for `pattern`.
+    ///
+    /// An error is returned if the pattern fails to parse.
+    fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> {
+        let ast = self.ast(pattern)?;
+        Ok(AstAnalysis::from_ast(&ast))
+    }
+
+    /// Parses `pattern` into its abstract syntax.
+    ///
+    /// An error is returned if the pattern fails to parse.
+    fn ast(&self, pattern: &str) -> Result<Ast, Error> {
+        let mut parser = ast::parse::ParserBuilder::new()
+            .nest_limit(self.nest_limit)
+            .octal(self.octal)
+            .ignore_whitespace(self.ignore_whitespace)
+            .build();
+        parser.parse(pattern).map_err(Error::regex)
+    }
+}
+
+/// A "configured" HIR expression, which is aware of the configuration which
+/// produced this HIR.
+///
+/// Since the configuration is tracked, values with this type can be
+/// transformed into other HIR expressions (or regular expressions) in a way
+/// that preserves the configuration. For example, the `fast_line_regex`
+/// method will apply literal extraction to the inner HIR and use that to build
+/// a new regex that matches the extracted literals in a way that is
+/// consistent with the configuration that produced this HIR. Likewise, the
+/// limits set on the configuration are propagated to any subsequently
+/// constructed HIR or regular expression.
+#[derive(Clone, Debug)]
+pub struct ConfiguredHIR {
+    original: String, // the pattern text originally given to `Config::hir`
+    config: Config, // the configuration that produced `expr`
+    analysis: AstAnalysis, // AST analysis of the original pattern
+    expr: Hir, // the (possibly transformed) HIR expression
+}
+
+impl ConfiguredHIR {
+    /// Return the configuration for this HIR expression.
+    pub fn config(&self) -> &Config {
+        &self.config
+    }
+
+    /// Compute the set of non-matching bytes for this HIR expression.
+    pub fn non_matching_bytes(&self) -> ByteSet {
+        non_matching_bytes(&self.expr)
+    }
+
+    /// Compiles this HIR expression into a regular expression.
+    pub fn regex(&self) -> Result<Regex, Error> {
+        let pattern = self.expr.to_string();
+        self.pattern_to_regex(&pattern)
+    }
+
+    /// Passes the concrete syntax of this HIR to `f` and parses whatever `f`
+    /// returns into a new HIR that carries the same configuration.
+    ///
+    /// This can be used to wrap a user provided regular expression with
+    /// additional semantics. e.g., See the `WordMatcher`.
+    pub fn with_pattern<F: FnMut(&str) -> String>(
+        &self,
+        mut f: F,
+    ) -> Result<ConfiguredHIR, Error>
+    {
+        let rewritten = f(&self.expr.to_string());
+        self.pattern_to_hir(&rewritten)
+    }
+
+    /// Returns a regular expression matching literals extracted from this
+    /// HIR, when a line terminator is configured and useful literals could
+    /// be extracted. Otherwise, `None` is returned.
+    ///
+    /// An error is returned if compiling the literal pattern fails.
+    ///
+    /// Something is only returned when a line terminator is set because
+    /// matches from this regex are generally candidates that must be
+    /// confirmed before a match is reported. In a line oriented search,
+    /// confirmation is easy: extend the candidate to its line boundaries and
+    /// re-search that line for a full match. That only works with a line
+    /// terminator set, since that setting guarantees that the regex itself
+    /// can never match through the line terminator byte.
+    pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
+        if self.config.line_terminator.is_some() {
+            if let Some(pattern) = LiteralSets::new(&self.expr).one_regex() {
+                return self.pattern_to_regex(&pattern).map(Some);
+            }
+        }
+        Ok(None)
+    }
+
+    /// Compiles `pattern` into a regex using this HIR's configuration.
+    fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
+        // Only a subset of the configuration is applied here, on purpose.
+        // The HIR was already computed with every setting in mind, so
+        // applying some of them a second time could change the meaning of
+        // the pattern. Take `(?U)a+`: it is folded into the HIR as a
+        // non-greedy repetition, which prints as `a+?`, which is correct.
+        // Setting `swap_greed` again would produce `(?U)a+?`, which is
+        // equal to `a+` and not what we were given.
+        //
+        // `case_insensitive` is skipped as well: it is already folded into
+        // the HIR, so applying it would only repeat work.
+        //
+        // `ignore_whitespace` is skipped because the concrete syntax emitted
+        // by the HIR printer never needs it.
+        //
+        // Everything else is set. Some options are important, such as the
+        // size limit, and some are needed to preserve the intention of the
+        // original pattern. For example, the Unicode flag impacts how the
+        // WordMatcher functions, namely, whether its word boundaries are
+        // Unicode aware or not.
+        let config = &self.config;
+        let mut builder = RegexBuilder::new(pattern);
+        builder
+            .nest_limit(config.nest_limit)
+            .octal(config.octal)
+            .multi_line(config.multi_line)
+            .dot_matches_new_line(config.dot_matches_new_line)
+            .unicode(config.unicode)
+            .size_limit(config.size_limit)
+            .dfa_size_limit(config.dfa_size_limit);
+        builder.build().map_err(Error::regex)
+    }
+
+    /// Parses `pattern` into a new configured HIR using this HIR's
+    /// configuration.
+    fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
+        // As in `pattern_to_regex`, only a subset of the knobs is applied.
+        // e.g., `swap_greed` is explicitly left out.
+        let config = &self.config;
+        let mut parser = ::regex_syntax::ParserBuilder::new()
+            .nest_limit(config.nest_limit)
+            .octal(config.octal)
+            .allow_invalid_utf8(true)
+            .multi_line(config.multi_line)
+            .dot_matches_new_line(config.dot_matches_new_line)
+            .unicode(config.unicode)
+            .build();
+        let expr = parser.parse(pattern).map_err(Error::regex)?;
+        Ok(ConfiguredHIR {
+            original: self.original.clone(),
+            config: self.config.clone(),
+            analysis: self.analysis.clone(),
+            expr,
+        })
+    }
+}
--- a/grep-regex/src/crlf.rs
+++ b/grep-regex/src/crlf.rs
@ -0,0 +1,83 @@
+use regex_syntax::hir::{self, Hir, HirKind};
+
+/// Rewrites every multi-line `$` (an `EndLine` anchor) as `(?:\r??$)`.
+///
+/// The rewrite does not preserve the exact semantics of `expr`. It does have
+/// the useful property that anything matched by `expr` is also matched by
+/// the returned expression; the returned expression may match other things
+/// as well.
+///
+/// The principal reason for doing this is that the underlying regex engine
+/// doesn't support a CRLF aware `$` look-around. It's planned to fix it at
+/// that level, but this kludge is used in the meantime.
+///
+/// While the match preserving semantics are neat, the match position
+/// semantics are messier: `$` only ever matches a position between
+/// characters, whereas `\r??` can match a character and change the offset.
+/// This is regrettable, but works out well in most cases, especially when a
+/// match is limited to a single line.
+pub fn crlfify(expr: Hir) -> Hir {
+    match expr.into_kind() {
+        // This arm must come before the general `Anchor` arm below.
+        HirKind::Anchor(hir::Anchor::EndLine) => {
+            let optional_cr = Hir::repetition(hir::Repetition {
+                kind: hir::RepetitionKind::ZeroOrOne,
+                greedy: false,
+                hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
+            });
+            let end_line = Hir::anchor(hir::Anchor::EndLine);
+            Hir::group(hir::Group {
+                kind: hir::GroupKind::NonCapturing,
+                hir: Box::new(Hir::concat(vec![optional_cr, end_line])),
+            })
+        }
+        // Leaves are rebuilt unchanged.
+        HirKind::Empty => Hir::empty(),
+        HirKind::Literal(lit) => Hir::literal(lit),
+        HirKind::Class(class) => Hir::class(class),
+        HirKind::Anchor(anchor) => Hir::anchor(anchor),
+        HirKind::WordBoundary(boundary) => Hir::word_boundary(boundary),
+        // Compound nodes are rebuilt around their rewritten children.
+        HirKind::Repetition(mut rep) => {
+            let inner = crlfify(*rep.hir);
+            rep.hir = Box::new(inner);
+            Hir::repetition(rep)
+        }
+        HirKind::Group(mut group) => {
+            let inner = crlfify(*group.hir);
+            group.hir = Box::new(inner);
+            Hir::group(group)
+        }
+        HirKind::Concat(exprs) => {
+            let mut rewritten = Vec::with_capacity(exprs.len());
+            for sub in exprs {
+                rewritten.push(crlfify(sub));
+            }
+            Hir::concat(rewritten)
+        }
+        HirKind::Alternation(exprs) => {
+            let mut rewritten = Vec::with_capacity(exprs.len());
+            for sub in exprs {
+                rewritten.push(crlfify(sub));
+            }
+            Hir::alternation(rewritten)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use regex_syntax::Parser;
+    use super::crlfify;
+
+    /// Parses `pattern`, applies `crlfify` and prints the result back to
+    /// concrete syntax.
+    fn roundtrip(pattern: &str) -> String {
+        let parsed = Parser::new().parse(pattern).unwrap();
+        crlfify(parsed).to_string()
+    }
+
+    #[test]
+    fn various() {
+        // (input pattern, expected printed expression)
+        let cases = [
+            (r"(?m)$", "(?:\r??(?m:$))"),
+            (r"(?m)$$", "(?:\r??(?m:$))(?:\r??(?m:$))"),
+            (
+                r"(?m)(?:foo$|bar$)",
+                "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))",
+            ),
+            (r"(?m)$a", "(?:\r??(?m:$))a"),
+            // Not a multiline `$`, so no crlfifying occurs.
+            (r"$", "\\z"),
+            // An escaped `$` is a literal, so it is left alone.
+            (r"\$", "\\$"),
+        ];
+        for &(input, expected) in cases.iter() {
+            assert_eq!(roundtrip(input), expected);
+        }
+    }
+}
--- a/grep-regex/src/error.rs
+++ b/grep-regex/src/error.rs
@ -0,0 +1,88 @@
+use std::error;
+use std::fmt;
+
+use util;
+
+/// An error that can occur in this crate.
+///
+/// Generally, this error corresponds to problems building a regular
+/// expression, whether it's in parsing, compilation or a problem with
+/// guaranteeing a configured optimization.
+#[derive(Clone, Debug)]
+pub struct Error {
+    kind: ErrorKind, // the specific failure; exposed through `kind()`
+}
+
+impl Error {
+    /// Wraps the given kind in an `Error`.
+    pub(crate) fn new(kind: ErrorKind) -> Error {
+        Error { kind: kind }
+    }
+
+    /// Builds a `Regex` error whose message is `err` rendered as a string.
+    pub(crate) fn regex<E: error::Error>(err: E) -> Error {
+        let message = err.to_string();
+        Error::new(ErrorKind::Regex(message))
+    }
+
+    /// Return the kind of this error.
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+}
+
+/// The kind of an error that can occur.
+#[derive(Clone, Debug)]
+pub enum ErrorKind {
+    /// An error that occurred as a result of parsing a regular expression.
+    /// This can be a syntax error or an error that results from attempting to
+    /// compile a regular expression that is too big.
+    ///
+    /// The string here is the underlying error converted to a string.
+    Regex(String),
+    /// An error that occurs when building a regex that isn't permitted to
+    /// match a line terminator. In general, building the regex will do its
+    /// best to make matching a line terminator impossible (e.g., by removing
+    /// `\n` from the `\s` character class), but if the regex contains a
+    /// `\n` literal, then there is no reasonable choice that can be made and
+    /// therefore an error is reported.
+    ///
+    /// The string is the literal sequence found in the regex that is not
+    /// allowed.
+    NotAllowed(String),
+    /// This error occurs when a non-ASCII line terminator was provided.
+    ///
+    /// The invalid byte is included in this error.
+    InvalidLineTerminator(u8),
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+}
+
+impl error::Error for Error {
+    /// Returns a short, fixed description for each kind of error.
+    fn description(&self) -> &str {
+        match *self.kind() {
+            ErrorKind::__Nonexhaustive => unreachable!(),
+            ErrorKind::InvalidLineTerminator(_) => "invalid line terminator",
+            ErrorKind::NotAllowed(_) => "literal not allowed",
+            ErrorKind::Regex(_) => "regex error",
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    /// Writes the human readable message for this error.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self.kind {
+            ErrorKind::__Nonexhaustive => unreachable!(),
+            // The underlying regex error message is shown as-is.
+            ErrorKind::Regex(ref message) => f.write_str(message),
+            ErrorKind::NotAllowed(ref lit) => {
+                write!(f, "the literal '{:?}' is not allowed in a regex", lit)
+            }
+            ErrorKind::InvalidLineTerminator(byte) => write!(
+                f,
+                "line terminators must be ASCII, but '{}' is not",
+                util::show_bytes(&[byte])
+            ),
+        }
+    }
+}
--- a/grep-regex/src/lib.rs
+++ b/grep-regex/src/lib.rs
@ -0,0 +1,27 @@
+/*!
+An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
+*/
+
+#![deny(missing_docs)]
+
+extern crate grep_matcher;
+#[macro_use]
+extern crate log;
+extern crate regex;
+extern crate regex_syntax;
+extern crate thread_local;
+extern crate utf8_ranges;
+
+pub use error::{Error, ErrorKind};
+pub use matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
+
+mod ast;
+mod config;
+mod crlf;
+mod error;
+mod literal;
+mod matcher;
+mod non_matching;
+mod strip;
+mod util;
+mod word;
--- a/grep-regex/src/literal.rs
+++ b/grep-regex/src/literal.rs
@ -0,0 +1,304 @@
+/*
+This module is responsible for extracting *inner* literals out of the AST of a
+regular expression. Normally this is the job of the regex engine itself, but
+the regex engine doesn't look for inner literals. Since we're doing line based
+searching, we can use them, so we need to do it ourselves.
+*/
+
+use std::cmp;
+
+use regex_syntax::hir::{self, Hir, HirKind};
+use regex_syntax::hir::literal::{Literal, Literals};
+
+use util;
+
+/// Represents prefix, suffix and inner "required" literals for a regular
+/// expression.
+///
+/// Prefixes and suffixes are detected using regex-syntax. The inner required
+/// literals are detected by custom code in this module, `union_required`
+/// (which is itself based on the code in regex-syntax).
+#[derive(Clone, Debug)]
+pub struct LiteralSets {
+    /// A set of prefix literals, as computed by `Literals::prefixes`.
+    prefixes: Literals,
+    /// A set of suffix literals, as computed by `Literals::suffixes`.
+    suffixes: Literals,
+    /// A set of literals such that at least one of them must appear in every
+    /// match. A literal in this set may be neither a prefix nor a suffix.
+    required: Literals,
+}
+
+impl LiteralSets {
+    /// Create a set of literals from the given HIR expression.
+    pub fn new(expr: &Hir) -> LiteralSets {
+        let mut required = Literals::empty();
+        union_required(expr, &mut required);
+        LiteralSets {
+            prefixes: Literals::prefixes(expr),
+            suffixes: Literals::suffixes(expr),
+            required: required,
+        }
+    }
+
+    /// If it is deemed advantageous to do so (via various suspicious
+    /// heuristics), this will return a single regular expression pattern that
+    /// matches a subset of the language matched by the regular expression that
+    /// generated these literal sets. The idea here is that the pattern
+    /// returned by this method is much cheaper to search for. i.e., It is
+    /// usually a single literal or an alternation of literals.
+    pub fn one_regex(&self) -> Option<String> {
+        // TODO: The logic in this function is basically inscrutable. It grew
+        // organically in the old grep 0.1 crate. Ideally, it would be
+        // re-worked. In fact, the entire inner literal extraction should be
+        // re-worked. Actually, most of regex-syntax's literal extraction
+        // should also be re-worked. Alas... only so much time in the day.
+
+        if self.prefixes.all_complete() && !self.prefixes.is_empty() {
+            debug!("literal prefixes detected: {:?}", self.prefixes);
+            // When this is true, the regex engine will do a literal scan,
+            // so we don't need to return anything.
+            return None;
+        }
+
+        // Out of inner required literals, prefixes and suffixes, which one
+        // is the longest? We pick the longest to do a fast literal scan under
+        // the assumption that a longer literal will have a lower false
+        // positive rate.
+        let pre_lcp = self.prefixes.longest_common_prefix();
+        let pre_lcs = self.prefixes.longest_common_suffix();
+        let suf_lcp = self.suffixes.longest_common_prefix();
+        let suf_lcs = self.suffixes.longest_common_suffix();
+
+        let req_lits = self.required.literals();
+        let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
+            None => &[],
+            Some(req) => &***req,
+        };
+
+        let mut lit = pre_lcp;
+        if pre_lcs.len() > lit.len() {
+            lit = pre_lcs;
+        }
+        if suf_lcp.len() > lit.len() {
+            lit = suf_lcp;
+        }
+        if suf_lcs.len() > lit.len() {
+            lit = suf_lcs;
+        }
+        if req_lits.len() == 1 && req.len() > lit.len() {
+            lit = req;
+        }
+
+        // Special case: if we detected an alternation of inner required
+        // literals and its longest literal is longer than the longest
+        // prefix/suffix, then choose the alternation. In practice, this
+        // helps with case insensitive matching, which can generate lots of
+        // inner required literals.
+        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
+        if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
+            debug!("required literals found: {:?}", req_lits);
+            let alts: Vec<String> = req_lits
+                .into_iter()
+                .map(|x| util::bytes_to_regex(x))
+                .collect();
+            // We're matching raw bytes, so disable Unicode mode.
+            Some(format!("(?-u:{})", alts.join("|")))
+        } else if lit.is_empty() {
+            None
+        } else {
+            debug!("required literal found: {:?}", util::show_bytes(lit));
+            Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))
+        }
+    }
+}
+
+/// Accumulate into `lits` the literals that `expr` requires, recursing
+/// through the structure of `expr`.
+///
+/// Wherever nothing useful can be extracted from a sub-expression, the
+/// literals gathered so far are cut via `Literals::cut`.
+fn union_required(expr: &Hir, lits: &mut Literals) {
+    match *expr.kind() {
+        HirKind::Literal(hir::Literal::Unicode(ch)) => {
+            // Literals are sequences of bytes, so add the UTF-8 encoding.
+            let mut utf8 = [0u8; 4];
+            let encoded = ch.encode_utf8(&mut utf8);
+            lits.cross_add(encoded.as_bytes());
+        }
+        HirKind::Literal(hir::Literal::Byte(byte)) => {
+            lits.cross_add(&[byte]);
+        }
+        HirKind::Class(hir::Class::Unicode(ref class)) => {
+            // Classes with five or more members are never expanded.
+            let expanded =
+                count_unicode_class(class) < 5 && lits.add_char_class(class);
+            if !expanded {
+                lits.cut();
+            }
+        }
+        HirKind::Class(hir::Class::Bytes(ref class)) => {
+            let expanded =
+                count_byte_class(class) < 5 && lits.add_byte_class(class);
+            if !expanded {
+                lits.cut();
+            }
+        }
+        HirKind::Group(ref group) => union_required(&group.hir, lits),
+        HirKind::Repetition(ref rep) => match rep.kind {
+            hir::RepetitionKind::ZeroOrOne
+            | hir::RepetitionKind::ZeroOrMore => lits.cut(),
+            hir::RepetitionKind::OneOrMore => {
+                union_required(&rep.hir, lits);
+                lits.cut();
+            }
+            hir::RepetitionKind::Range(ref range) => {
+                let (min, max) = match *range {
+                    hir::RepetitionRange::Exactly(n) => (n, Some(n)),
+                    hir::RepetitionRange::AtLeast(n) => (n, None),
+                    hir::RepetitionRange::Bounded(lo, hi) => (lo, Some(hi)),
+                };
+                repeat_range_literals(
+                    &rep.hir, min, max, rep.greedy, lits, union_required);
+            }
+        },
+        HirKind::Concat(ref exprs) => match exprs.len() {
+            0 => {}
+            1 => union_required(&exprs[0], lits),
+            _ => {
+                for sub in exprs {
+                    let mut sub_lits = lits.to_empty();
+                    union_required(sub, &mut sub_lits);
+                    if sub_lits.is_empty() {
+                        lits.cut();
+                        continue;
+                    }
+                    if sub_lits.contains_empty() {
+                        lits.cut();
+                    }
+                    if !lits.cross_product(&sub_lits) {
+                        // Nothing in `lits` could be extended, so stop
+                        // here and freeze every member.
+                        lits.cut();
+                        break;
+                    }
+                }
+            }
+        },
+        HirKind::Alternation(ref alts) => {
+            alternate_literals(alts, lits, union_required);
+        }
+        // No literals are extracted from any other kind of expression.
+        _ => lits.cut(),
+    }
+}
+
+/// Extract required literals from a counted repetition of `e`, i.e.,
+/// `e{min}`, `e{min,}` or `e{min,max}`, using `f` to extract the literals
+/// of `e` itself.
+///
+/// When `min` is zero, `e` may not appear at all, so the literals in `lits`
+/// are cut. Otherwise, literals are extracted from a single occurrence of
+/// `e` and are then cut unless the repetition matches `e` exactly once.
+/// The cut prevents a caller from joining whatever follows the repetition
+/// directly onto that single occurrence. e.g., without it, `x[ab]{2}y` would
+/// wrongly yield `xay` and `xby` as required literals, neither of which
+/// occurs in the match `xaby`.
+///
+/// `_greedy` is unused.
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
+    e: &Hir,
+    min: u32,
+    max: Option<u32>,
+    _greedy: bool,
+    lits: &mut Literals,
+    mut f: F,
+) {
+    if min == 0 {
+        // This is a bit conservative. If `max` is set, then we could
+        // treat this as a finite set of alternations. For now, we
+        // just treat it as `e*`.
+        lits.cut();
+    } else {
+        let n = cmp::min(lits.limit_size(), min as usize);
+        // We only extract literals from a single repetition, even though
+        // we could do more. e.g., `a{3}` will have `a` extracted instead of
+        // `aaa`. The reason is that inner literal extraction can't be unioned
+        // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
+        // is wrong.
+        f(e, lits);
+        // Since only one occurrence was extracted, the literals can only be
+        // extended by what follows when there is exactly one occurrence.
+        let exactly_once = min == 1 && max == Some(1);
+        if n < min as usize || !exactly_once {
+            lits.cut();
+        }
+    }
+}
+
+/// Extract required literals from the alternation of `es`, using `f` to
+/// extract the literals of each alternate.
+///
+/// Only the longest common prefix and the longest common suffix of the
+/// literals gathered from all alternates are reported. If any alternate
+/// yields no literals, or if the union of literals can't be completed, then
+/// the literals in `lits` are cut and nothing is added.
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
+    es: &[Hir],
+    lits: &mut Literals,
+    mut f: F,
+) {
+    let mut unioned = lits.to_empty();
+    for alt in es {
+        let mut alt_lits = lits.to_empty();
+        alt_lits.set_limit_size(lits.limit_size() / 5);
+        f(alt, &mut alt_lits);
+        // An alternate without literals, or a union that can't complete,
+        // means the whole alternation is thrown away and any existing
+        // members are frozen.
+        let merged = !alt_lits.is_empty() && unioned.union(alt_lits);
+        if !merged {
+            lits.cut();
+            return;
+        }
+    }
+    // For now, only a common prefix and a common suffix are considered. If
+    // both are empty, nothing is reported. Doing better would require
+    // something more expressive than just a "set of literals."
+    let prefix = unioned.longest_common_prefix();
+    let suffix = unioned.longest_common_suffix();
+    if !prefix.is_empty() {
+        lits.cross_add(prefix);
+    }
+    lits.cut();
+    if !suffix.is_empty() {
+        lits.add(Literal::empty());
+        lits.add(Literal::new(suffix.to_vec()));
+    }
+}
+
+/// Count the characters covered by the given Unicode class by summing the
+/// sizes of its (inclusive) ranges.
+fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
+    let mut total = 0;
+    for range in cls.iter() {
+        let (first, last) = (range.start() as u32, range.end() as u32);
+        total += 1 + (last - first);
+    }
+    total
+}
+
+/// Count the bytes covered by the given byte class by summing the sizes of
+/// its (inclusive) ranges.
+fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
+    let mut total = 0;
+    for range in cls.iter() {
+        let (first, last) = (range.start() as u32, range.end() as u32);
+        total += 1 + (last - first);
+    }
+    total
+}
+
+#[cfg(test)]
+mod tests {
+    use regex_syntax::Parser;
+    use super::LiteralSets;
+
+    fn sets(pattern: &str) -> LiteralSets {
+        let hir = Parser::new().parse(pattern).unwrap();
+        LiteralSets::new(&hir)
+    }
+
+    fn one_regex(pattern: &str) -> Option<String> {
+        sets(pattern).one_regex()
+    }
+
+    // Wrap a pattern in `(?-u:...)`, the format returned by `one_regex`.
+    fn pat(pattern: &str) -> Option<String> {
+        Some(format!("(?-u:{})", pattern))
+    }
+
+    #[test]
+    fn various() {
+        // Obviously no literals.
+        assert!(one_regex(r"\w").is_none());
+        assert!(one_regex(r"\pL").is_none());
+
+        // Tantalizingly close.
+        assert!(one_regex(r"\w|foo").is_none());
+
+        // There's a literal, but it's better if the regex engine handles it
+        // internally.
+        assert!(one_regex(r"abc").is_none());
+
+        // Core use cases.
+        assert_eq!(one_regex(r"\wabc\w"), pat("abc"));
+        assert_eq!(one_regex(r"abc\w"), pat("abc"));
+
+        // TODO: Make these pass. We're missing some potentially big wins
+        // without these.
+        // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
+        // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
+    }
+}
--- a/grep-regex/src/matcher.rs
+++ b/grep-regex/src/matcher.rs
@ -0,0 +1,864 @@
+use std::collections::HashMap;
+
+use grep_matcher::{
+    Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError, ByteSet,
+};
+use regex::bytes::{CaptureLocations, Regex};
+
+use config::{Config, ConfiguredHIR};
+use error::Error;
+use word::WordMatcher;
+
+/// A builder for constructing a `Matcher` using regular expressions.
+///
+/// This builder re-exports many of the same options found on the regex crate's
+/// builder, in addition to a few other options such as smart case, word
+/// matching and the ability to set a line terminator which may enable certain
+/// types of optimizations.
+///
+/// The syntax supported is documented as part of the regex crate:
+/// <https://docs.rs/regex/*/regex/#syntax>
+#[derive(Clone, Debug)]
+pub struct RegexMatcherBuilder {
+    config: Config,
+}
+
+impl Default for RegexMatcherBuilder {
+    /// Equivalent to `RegexMatcherBuilder::new()`.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexMatcherBuilder {
+    /// Create a new builder for configuring a regex matcher.
+    pub fn new() -> RegexMatcherBuilder {
+        RegexMatcherBuilder {
+            config: Config::default(),
+        }
+    }
+
+    /// Build a new matcher using the current configuration for the provided
+    /// pattern.
+    ///
+    /// The syntax supported is documented as part of the regex crate:
+    /// <https://docs.rs/regex/*/regex/#syntax>
+    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
+        let chir = self.config.hir(pattern)?;
+        let fast_line_regex = chir.fast_line_regex()?;
+        let non_matching_bytes = chir.non_matching_bytes();
+        if let Some(ref re) = fast_line_regex {
+            trace!("extracted fast line regex: {:?}", re);
+        }
+        Ok(RegexMatcher {
+            config: self.config.clone(),
+            matcher: RegexMatcherImpl::new(&chir)?,
+            fast_line_regex: fast_line_regex,
+            non_matching_bytes: non_matching_bytes,
+        })
+    }
+
+    /// Set the value for the case insensitive (`i`) flag.
+    ///
+    /// When enabled, letters in the pattern will match both upper case and
+    /// lower case variants.
+    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        self.config.case_insensitive = yes;
+        self
+    }
+
+    /// Whether to enable "smart case" or not.
+    ///
+    /// When smart case is enabled, the builder will automatically enable
+    /// case insensitive matching based on how the pattern is written. Namely,
+    /// case insensitive mode is enabled when both of the following things
+    /// are true:
+    ///
+    /// 1. The pattern contains at least one literal character. For example,
+    ///    `a\w` contains a literal (`a`) but `\w` does not.
+    /// 2. Of the literals in the pattern, none of them are considered to be
+    ///    uppercase according to Unicode. For example, `foo\pL` has no
+    ///    uppercase literals but `Foo\pL` does.
+    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        self.config.case_smart = yes;
+        self
+    }
+
+    /// Set the value for the multi-line matching (`m`) flag.
+    ///
+    /// When enabled, `^` matches the beginning of lines and `$` matches the
+    /// end of lines.
+    ///
+    /// By default, they match beginning/end of the input.
+    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        self.config.multi_line = yes;
+        self
+    }
+
+    /// Set the value for the any character (`s`) flag, wherein `.` matches
+    /// anything when `s` is set and matches anything except for new line when
+    /// it is not set (the default).
+    ///
+    /// N.B. "matches anything" means "any byte" when Unicode is disabled and
+    /// means "any valid UTF-8 encoding of any Unicode scalar value" when
+    /// Unicode is enabled.
+    pub fn dot_matches_new_line(
+        &mut self,
+        yes: bool,
+    ) -> &mut RegexMatcherBuilder {
+        self.config.dot_matches_new_line = yes;
+        self
+    }
+
+    /// Set the value for the greedy swap (`U`) flag.
+    ///
+    /// When enabled, a pattern like `a*` is lazy (tries to find shortest
+    /// match) and `a*?` is greedy (tries to find longest match).
+    ///
+    /// By default, `a*` is greedy and `a*?` is lazy.
+    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        self.config.swap_greed = yes;
+        self
+    }
+
+    /// Set the value for the ignore whitespace (`x`) flag.
+    ///
+    /// When enabled, whitespace such as new lines and spaces will be ignored
+    /// between expressions of the pattern, and `#` can be used to start a
+    /// comment until the next new line.
+    pub fn ignore_whitespace(
+        &mut self,
+        yes: bool,
+    ) -> &mut RegexMatcherBuilder {
+        self.config.ignore_whitespace = yes;
+        self
+    }
+
+    /// Set the value for the Unicode (`u`) flag.
+    ///
+    /// Enabled by default. When disabled, character classes such as `\w` only
+    /// match ASCII word characters instead of all Unicode word characters.
+    pub fn unicode(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        self.config.unicode = yes;
+        self
+    }
+
+    /// Whether to support octal syntax or not.
+    ///
+    /// Octal syntax is a little-known way of uttering Unicode codepoints in
+    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+    /// `\141` are all equivalent regular expressions, where the last example
+    /// shows octal syntax.
+    ///
+    /// While supporting octal syntax isn't in and of itself a problem, it does
+    /// make good error messages harder. That is, in PCRE based regex engines,
+    /// syntax like `\0` invokes a backreference, which is explicitly
+    /// unsupported in Rust's regex engine. However, many users expect it to
+    /// be supported. Therefore, when octal support is disabled, the error
+    /// message will explicitly mention that backreferences aren't supported.
+    ///
+    /// Octal syntax is disabled by default.
+    pub fn octal(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        self.config.octal = yes;
+        self
+    }
+
+    /// Set the approximate size limit of the compiled regular expression.
+    ///
+    /// This roughly corresponds to the number of bytes occupied by a single
+    /// compiled program. If the program exceeds this number, then a
+    /// compilation error is returned.
+    pub fn size_limit(&mut self, bytes: usize) -> &mut RegexMatcherBuilder {
+        self.config.size_limit = bytes;
+        self
+    }
+
+    /// Set the approximate size of the cache used by the DFA.
+    ///
+    /// This roughly corresponds to the number of bytes that the DFA will
+    /// use while searching.
+    ///
+    /// Note that this is a *per thread* limit. There is no way to set a global
+    /// limit. In particular, if a regex is used from multiple threads
+    /// simultaneously, then each thread may use up to the number of bytes
+    /// specified here.
+    pub fn dfa_size_limit(
+        &mut self,
+        bytes: usize,
+    ) -> &mut RegexMatcherBuilder {
+        self.config.dfa_size_limit = bytes;
+        self
+    }
+
+    /// Set the nesting limit for this parser.
+    ///
+    /// The nesting limit controls how deep the abstract syntax tree is allowed
+    /// to be. If the AST exceeds the given limit (e.g., with too many nested
+    /// groups), then an error is returned by the parser.
+    ///
+    /// The purpose of this limit is to act as a heuristic to prevent stack
+    /// overflow for consumers that do structural induction on an `Ast` using
+    /// explicit recursion. While this crate never does this (instead using
+    /// constant stack space and moving the call stack to the heap), other
+    /// crates may.
+    ///
+    /// This limit is not checked until the entire Ast is parsed. Therefore,
+    /// if callers want to put a limit on the amount of heap space used, then
+    /// they should impose a limit on the length, in bytes, of the concrete
+    /// pattern string. In particular, this is viable since this parser
+    /// implementation will limit itself to heap space proportional to the
+    /// length of the pattern string.
+    ///
+    /// Note that a nest limit of `0` will return a nest limit error for most
+    /// patterns but not all. For example, a nest limit of `0` permits `a` but
+    /// not `ab`, since `ab` requires a concatenation, which results in a nest
+    /// depth of `1`. In general, a nest limit is not something that manifests
+    /// in an obvious way in the concrete syntax, therefore, it should not be
+    /// used in a granular way.
+    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexMatcherBuilder {
+        self.config.nest_limit = limit;
+        self
+    }
+
+    /// Set an ASCII line terminator for the matcher.
+    ///
+    /// The purpose of setting a line terminator is to enable a certain class
+    /// of optimizations that can make line oriented searching faster. Namely,
+    /// when a line terminator is enabled, then the builder will guarantee that
+    /// the resulting matcher will never be capable of producing a match that
+    /// contains the line terminator. Because of this guarantee, users of the
+    /// resulting matcher do not need to slowly execute a search line by line
+    /// for line oriented search.
+    ///
+    /// If the aforementioned guarantee about not matching a line terminator
+    /// cannot be made because of how the pattern was written, then the builder
+    /// will return an error when attempting to construct the matcher. For
+    /// example, the pattern `a\sb` will be transformed such that it can never
+    /// match `a\nb` (when `\n` is the line terminator), but the pattern `a\nb`
+    /// will result in an error since the `\n` cannot be easily removed without
+    /// changing the fundamental intent of the pattern.
+    ///
+    /// If the given line terminator isn't an ASCII byte (`<=127`), then the
+    /// builder will return an error when constructing the matcher.
+    pub fn line_terminator(
+        &mut self,
+        line_term: Option<u8>,
+    ) -> &mut RegexMatcherBuilder {
+        self.config.line_terminator = line_term.map(LineTerminator::byte);
+        self
+    }
+
+    /// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
+    /// regex patterns.
+    ///
+    /// This method sets two distinct settings:
+    ///
+    /// 1. It causes the line terminator for the matcher to be `\r\n`. Namely,
+    ///    this prevents the matcher from ever producing a match that contains
+    ///    a `\r` or `\n`.
+    /// 2. It translates all instances of `$` in the pattern to `(?:\r??$)`.
+    ///    This works around the fact that the regex engine does not support
+    ///    matching CRLF as a line terminator when using `$`.
+    ///
+    /// In particular, because of (2), the matches produced by the matcher may
+    /// be slightly different than what one would expect given the pattern.
+    /// This is the trade off made: in many cases, `$` will "just work" in the
+    /// presence of `\r\n` line terminators, but matches may require some
+    /// trimming to faithfully represent the intended match.
+    ///
+    /// Note that if you do not wish to set the line terminator but would still
+    /// like `$` to match `\r\n` line terminators, then it is valid to call
+    /// `crlf(true)` followed by `line_terminator(None)`. Ordering is
+    /// important, since `crlf` and `line_terminator` override each other.
+    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        if yes {
+            self.config.line_terminator = Some(LineTerminator::crlf());
+        } else {
+            self.config.line_terminator = None;
+        }
+        self.config.crlf = yes;
+        self
+    }
+
+    /// Require that all matches occur on word boundaries.
+    ///
+    /// Enabling this option is subtly different than putting `\b` assertions
+    /// on both sides of your pattern. In particular, a `\b` assertion requires
+    /// that one side of it match a word character while the other match a
+    /// non-word character. This option, in contrast, merely requires that
+    /// one side match a non-word character.
+    ///
+    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
+    /// word character. However, `-2` with this `word` option enabled will
+    /// match the `-2` in `foo -2 bar`.
+    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
+        self.config.word = yes;
+        self
+    }
+}
+
+/// An implementation of the `Matcher` trait using Rust's standard regex
+/// library.
+#[derive(Clone, Debug)]
+pub struct RegexMatcher {
+    /// The configuration specified by the caller.
+    config: Config,
+    /// The underlying matcher implementation.
+    matcher: RegexMatcherImpl,
+    /// A regex that never reports false negatives but may report false
+    /// positives that is believed to be capable of being matched more quickly
+    /// than `matcher`. Typically, this is a single literal or an alternation
+    /// of literals.
+    fast_line_regex: Option<Regex>,
+    /// A set of bytes that will never appear in a match.
+    non_matching_bytes: ByteSet,
+}
+
+impl RegexMatcher {
+    /// Compile the given pattern into a matcher using a builder with its
+    /// default configuration.
+    pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
+        let builder = RegexMatcherBuilder::new();
+        builder.build(pattern)
+    }
+
+    /// Compile the given pattern into a matcher using the default
+    /// configuration, except that `\n` is set as the line terminator.
+    ///
+    /// This returns an error if the given pattern contains a literal `\n`.
+    /// Other uses of `\n` (such as in `\s`) are removed transparently.
+    pub fn new_line_matcher(pattern: &str) -> Result<RegexMatcher, Error> {
+        let mut builder = RegexMatcherBuilder::new();
+        builder.line_terminator(Some(b'\n'));
+        builder.build(pattern)
+    }
+}
+
+/// An encapsulation of the type of matcher we use in `RegexMatcher`.
+#[derive(Clone, Debug)]
+enum RegexMatcherImpl {
+    /// The standard matcher, used when word matching is not enabled.
+    Standard(StandardMatcher),
+    /// A matcher that only matches at word boundaries. This transforms the
+    /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
+    /// Because of this, the WordMatcher provides its own implementation of
+    /// `Matcher` to encapsulate its use of capture groups to make them
+    /// invisible to the caller.
+    Word(WordMatcher),
+}
+
+impl RegexMatcherImpl {
+    /// Build the matcher implementation selected by the configuration of
+    /// `expr`: a word matcher if its `word` option is set and a standard
+    /// matcher otherwise.
+    fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
+        let imp = if expr.config().word {
+            RegexMatcherImpl::Word(WordMatcher::new(expr)?)
+        } else {
+            RegexMatcherImpl::Standard(StandardMatcher::new(expr)?)
+        };
+        Ok(imp)
+    }
+}
+
+// This implementation just dispatches on the internal matcher impl except
+// for the line terminator optimization, which is possibly executed via
+// `fast_line_regex`.
+impl Matcher for RegexMatcher {
+    type Captures = RegexCaptures;
+    type Error = NoError;
+
+    fn find_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<Match>, NoError> {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.find_at(haystack, at),
+            Word(ref m) => m.find_at(haystack, at),
+        }
+    }
+
+    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.new_captures(),
+            Word(ref m) => m.new_captures(),
+        }
+    }
+
+    fn capture_count(&self) -> usize {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.capture_count(),
+            Word(ref m) => m.capture_count(),
+        }
+    }
+
+    fn capture_index(&self, name: &str) -> Option<usize> {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.capture_index(name),
+            Word(ref m) => m.capture_index(name),
+        }
+    }
+
+    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.find(haystack),
+            Word(ref m) => m.find(haystack),
+        }
+    }
+
+    fn find_iter<F>(
+        &self,
+        haystack: &[u8],
+        matched: F,
+    ) -> Result<(), NoError>
+    where F: FnMut(Match) -> bool
+    {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.find_iter(haystack, matched),
+            Word(ref m) => m.find_iter(haystack, matched),
+        }
+    }
+
+    fn try_find_iter<F, E>(
+        &self,
+        haystack: &[u8],
+        matched: F,
+    ) -> Result<Result<(), E>, NoError>
+    where F: FnMut(Match) -> Result<bool, E>
+    {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.try_find_iter(haystack, matched),
+            Word(ref m) => m.try_find_iter(haystack, matched),
+        }
+    }
+
+    fn captures(
+        &self,
+        haystack: &[u8],
+        caps: &mut RegexCaptures,
+    ) -> Result<bool, NoError> {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.captures(haystack, caps),
+            Word(ref m) => m.captures(haystack, caps),
+        }
+    }
+
+    fn captures_iter<F>(
+        &self,
+        haystack: &[u8],
+        caps: &mut RegexCaptures,
+        matched: F,
+    ) -> Result<(), NoError>
+    where F: FnMut(&RegexCaptures) -> bool
+    {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.captures_iter(haystack, caps, matched),
+            Word(ref m) => m.captures_iter(haystack, caps, matched),
+        }
+    }
+
+    fn try_captures_iter<F, E>(
+        &self,
+        haystack: &[u8],
+        caps: &mut RegexCaptures,
+        matched: F,
+    ) -> Result<Result<(), E>, NoError>
+    where F: FnMut(&RegexCaptures) -> Result<bool, E>
+    {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
+            Word(ref m) => m.try_captures_iter(haystack, caps, matched),
+        }
+    }
+
+    fn captures_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        caps: &mut RegexCaptures,
+    ) -> Result<bool, NoError> {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.captures_at(haystack, at, caps),
+            Word(ref m) => m.captures_at(haystack, at, caps),
+        }
+    }
+
+    fn replace<F>(
+        &self,
+        haystack: &[u8],
+        dst: &mut Vec<u8>,
+        append: F,
+    ) -> Result<(), NoError>
+    where F: FnMut(Match, &mut Vec<u8>) -> bool
+    {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.replace(haystack, dst, append),
+            Word(ref m) => m.replace(haystack, dst, append),
+        }
+    }
+
+    fn replace_with_captures<F>(
+        &self,
+        haystack: &[u8],
+        caps: &mut RegexCaptures,
+        dst: &mut Vec<u8>,
+        append: F,
+    ) -> Result<(), NoError>
+    where F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool
+    {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => {
+                m.replace_with_captures(haystack, caps, dst, append)
+            }
+            Word(ref m) => {
+                m.replace_with_captures(haystack, caps, dst, append)
+            }
+        }
+    }
+
+    fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> {
+        use self::RegexMatcherImpl::*;
+        match self.matcher {
+            Standard(ref m) => m.is_match(haystack),
+            Word(ref m) => m.is_match(haystack),
+        }
+    }
+
+    /// Reports whether the wrapped matcher implementation finds a match in
+    /// `haystack` when the search starts at offset `at`.
+    fn is_match_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, NoError> {
+        match self.matcher {
+            RegexMatcherImpl::Standard(ref inner) => {
+                inner.is_match_at(haystack, at)
+            }
+            RegexMatcherImpl::Word(ref inner) => {
+                inner.is_match_at(haystack, at)
+            }
+        }
+    }
+
+    /// Forwards to the `shortest_match` method of the wrapped matcher
+    /// implementation and returns its result unchanged.
+    fn shortest_match(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<usize>, NoError> {
+        match self.matcher {
+            RegexMatcherImpl::Standard(ref inner) => {
+                inner.shortest_match(haystack)
+            }
+            RegexMatcherImpl::Word(ref inner) => {
+                inner.shortest_match(haystack)
+            }
+        }
+    }
+
+    /// Forwards to the `shortest_match_at` method of the wrapped matcher
+    /// implementation, searching `haystack` from offset `at`.
+    fn shortest_match_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<usize>, NoError> {
+        match self.matcher {
+            RegexMatcherImpl::Standard(ref inner) => {
+                inner.shortest_match_at(haystack, at)
+            }
+            RegexMatcherImpl::Word(ref inner) => {
+                inner.shortest_match_at(haystack, at)
+            }
+        }
+    }
+
+    /// Returns the byte set stored in this matcher's `non_matching_bytes`
+    /// field. This implementation always returns `Some`.
+    fn non_matching_bytes(&self) -> Option<&ByteSet> {
+        Some(&self.non_matching_bytes)
+    }
+
+    /// Returns the line terminator recorded in this matcher's configuration,
+    /// or `None` when the configuration does not have one set.
+    fn line_terminator(&self) -> Option<LineTerminator> {
+        self.config.line_terminator
+    }
+
+    /// Looks for a line in `haystack` that may contain a match.
+    ///
+    /// When a fast line regex is available, its shortest match is reported
+    /// as a `Candidate`. Otherwise this matcher's own `shortest_match` is
+    /// used and its result is reported as `Confirmed`.
+    fn find_candidate_line(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<LineMatchKind>, NoError> {
+        if let Some(ref fast) = self.fast_line_regex {
+            let end = fast.shortest_match(haystack);
+            return Ok(end.map(LineMatchKind::Candidate));
+        }
+        let end = self.shortest_match(haystack)?;
+        Ok(end.map(LineMatchKind::Confirmed))
+    }
+}
+
+/// The implementation of the standard regex matcher.
+///
+/// Match offsets are reported exactly as the compiled regex produces them.
+/// The only additional state is a lookup table used to resolve capture group
+/// names to indices.
+#[derive(Clone, Debug)]
+struct StandardMatcher {
+    /// The regular expression compiled from the pattern provided by the
+    /// caller.
+    regex: Regex,
+    /// A map from capture group name to its corresponding index.
+    names: HashMap<String, usize>,
+}
+
+impl StandardMatcher {
+    /// Compiles `expr` into a regex and indexes its named capture groups.
+    ///
+    /// Returns an error if the regex cannot be built from `expr`.
+    fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> {
+        let regex = expr.regex()?;
+        // Unnamed groups yield `None` and are skipped; named groups keep
+        // the index at which the regex reports them.
+        let names = regex
+            .capture_names()
+            .enumerate()
+            .filter_map(|(index, name)| {
+                name.map(|n| (n.to_string(), index))
+            })
+            .collect::<HashMap<String, usize>>();
+        Ok(StandardMatcher { regex: regex, names: names })
+    }
+}
+
+impl Matcher for StandardMatcher {
+    type Captures = RegexCaptures;
+    type Error = NoError;
+
+    /// Returns the first match of the regex in `haystack` when searching
+    /// from offset `at`, or `None` if there is no match.
+    fn find_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<Match>, NoError> {
+        let found = self.regex.find_at(haystack, at);
+        Ok(found.map(|m| Match::new(m.start(), m.end())))
+    }
+
+    /// Creates an empty set of capture locations sized for this regex.
+    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
+        let locs = self.regex.capture_locations();
+        Ok(RegexCaptures::new(locs))
+    }
+
+    /// Returns the number of capture groups reported by the regex.
+    fn capture_count(&self) -> usize {
+        self.regex.captures_len()
+    }
+
+    /// Resolves a capture group name to its index, if such a group exists.
+    fn capture_index(&self, name: &str) -> Option<usize> {
+        self.names.get(name).cloned()
+    }
+
+    /// Calls `matched` for each successive match in `haystack`.
+    ///
+    /// Iteration stops early when `matched` returns `Ok(false)` or an
+    /// error. An error from `matched` is handed back in the inner result.
+    fn try_find_iter<F, E>(
+        &self,
+        haystack: &[u8],
+        mut matched: F,
+    ) -> Result<Result<(), E>, NoError>
+    where F: FnMut(Match) -> Result<bool, E>
+    {
+        for m in self.regex.find_iter(haystack) {
+            let keep_going = match matched(Match::new(m.start(), m.end())) {
+                Ok(keep_going) => keep_going,
+                Err(err) => return Ok(Err(err)),
+            };
+            if !keep_going {
+                break;
+            }
+        }
+        Ok(Ok(()))
+    }
+
+    /// Searches `haystack` from offset `at`, writing the capture group
+    /// locations into `caps`. Returns whether a match was found.
+    fn captures_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        caps: &mut RegexCaptures,
+    ) -> Result<bool, NoError> {
+        let overall = self.regex.captures_read_at(&mut caps.locs, haystack, at);
+        Ok(overall.is_some())
+    }
+
+    /// Forwards to the regex's own `shortest_match_at`.
+    fn shortest_match_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<usize>, NoError> {
+        let end = self.regex.shortest_match_at(haystack, at);
+        Ok(end)
+    }
+}
+
+/// Represents the match offsets of each capturing group in a match.
+///
+/// The first, or `0`th capture group, always corresponds to the entire match
+/// and is guaranteed to be present when a match occurs. The next capture
+/// group, at index `1`, corresponds to the first capturing group in the regex,
+/// ordered by the position at which the left opening parenthesis occurs.
+///
+/// Note that not all capturing groups are guaranteed to be present in a match.
+/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
+/// or `bar` will ever be set in any given match.
+///
+/// In order to access a capture group by name, you'll need to first find the
+/// index of the group using the corresponding matcher's `capture_index`
+/// method, and then use that index with `RegexCaptures::get`.
+#[derive(Clone, Debug)]
+pub struct RegexCaptures {
+    /// Where the locations are stored.
+    locs: CaptureLocations,
+    /// These captures behave as if the capturing groups begin at the given
+    /// offset. When set to `0`, this has no effect and capture groups are
+    /// indexed like normal.
+    ///
+    /// This is useful when building matchers that wrap arbitrary regular
+    /// expressions. For example, `WordMatcher` takes an existing regex `re`
+    /// and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that the regex
+    /// has been wrapped from the caller. In order to do this, the matcher
+    /// and the capturing groups must behave as if `(re)` is the `0`th capture
+    /// group.
+    offset: usize,
+}
+
+impl Captures for RegexCaptures {
+    /// Returns the number of capture groups visible to the caller, which is
+    /// the number of underlying groups minus the hidden `offset`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `offset` exceeds the number of underlying groups, which
+    /// would mean these captures were constructed incorrectly.
+    fn len(&self) -> usize {
+        self.locs.len().checked_sub(self.offset).unwrap()
+    }
+
+    /// Returns the match offsets of the `i`th visible capture group.
+    ///
+    /// The index is shifted by `offset` before it is looked up. An index
+    /// that is out of range, including one so large that applying the shift
+    /// would overflow, yields `None` rather than a panic.
+    fn get(&self, i: usize) -> Option<Match> {
+        i.checked_add(self.offset)
+            .and_then(|actual| self.locs.pos(actual))
+            .map(|(s, e)| Match::new(s, e))
+    }
+}
+
+impl RegexCaptures {
+    /// Wraps `locs` without hiding any capture groups (an offset of `0`).
+    pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
+        RegexCaptures { locs: locs, offset: 0 }
+    }
+
+    /// Wraps `locs` so that group indices seen by the caller are shifted by
+    /// `offset` relative to the underlying locations.
+    pub(crate) fn with_offset(
+        locs: CaptureLocations,
+        offset: usize,
+    ) -> RegexCaptures {
+        RegexCaptures { locs: locs, offset: offset }
+    }
+
+    /// Grants mutable access to the underlying capture locations so that a
+    /// regex search can write into them.
+    pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
+        let locs = &mut self.locs;
+        locs
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use grep_matcher::{LineMatchKind, Matcher};
+    use super::*;
+
+    // Test that enabling word matches does the right thing and demonstrate
+    // the difference between it and surrounding the regex in `\b`.
+    #[test]
+    fn word() {
+        let matcher = RegexMatcherBuilder::new()
+            .word(true)
+            .build(r"-2")
+            .unwrap();
+        assert!(matcher.is_match(b"abc -2 foo").unwrap());
+
+        // `\b` needs a word character on exactly one side. Both ` ` and `-`
+        // are non-word characters, so the leading `\b` can never match here.
+        let matcher = RegexMatcherBuilder::new()
+            .word(false)
+            .build(r"\b-2\b")
+            .unwrap();
+        assert!(!matcher.is_match(b"abc -2 foo").unwrap());
+    }
+
+    // Test that enabling a line terminator prevents it from matching through
+    // said line terminator.
+    #[test]
+    fn line_terminator() {
+        // This works, because there's no line terminator specified.
+        let matcher = RegexMatcherBuilder::new()
+            .build(r"abc\sxyz")
+            .unwrap();
+        assert!(matcher.is_match(b"abc\nxyz").unwrap());
+
+        // This doesn't.
+        let matcher = RegexMatcherBuilder::new()
+            .line_terminator(Some(b'\n'))
+            .build(r"abc\sxyz")
+            .unwrap();
+        assert!(!matcher.is_match(b"abc\nxyz").unwrap());
+    }
+
+    // Ensure that the builder returns an error if a line terminator is set
+    // and the regex could not be modified to remove a line terminator.
+    #[test]
+    fn line_terminator_error() {
+        assert!(RegexMatcherBuilder::new()
+            .line_terminator(Some(b'\n'))
+            .build(r"a\nz")
+            .is_err())
+    }
+
+    // Test that enabling CRLF permits `$` to match at the end of a line.
+    #[test]
+    fn line_terminator_crlf() {
+        // Test normal use of `$` with a `\n` line terminator.
+        let matcher = RegexMatcherBuilder::new()
+            .multi_line(true)
+            .build(r"abc$")
+            .unwrap();
+        assert!(matcher.is_match(b"abc\n").unwrap());
+
+        // Test that `$` doesn't match at `\r\n` boundary normally.
+        let matcher = RegexMatcherBuilder::new()
+            .multi_line(true)
+            .build(r"abc$")
+            .unwrap();
+        assert!(!matcher.is_match(b"abc\r\n").unwrap());
+
+        // Now check the CRLF handling.
+        let matcher = RegexMatcherBuilder::new()
+            .multi_line(true)
+            .crlf(true)
+            .build(r"abc$")
+            .unwrap();
+        assert!(matcher.is_match(b"abc\r\n").unwrap());
+    }
+
+    // Test that smart case works.
+    #[test]
+    fn case_smart() {
+        // An all-lowercase pattern is expected to match case insensitively.
+        let matcher = RegexMatcherBuilder::new()
+            .case_smart(true)
+            .build(r"abc")
+            .unwrap();
+        assert!(matcher.is_match(b"ABC").unwrap());
+
+        // A pattern containing an uppercase letter stays case sensitive.
+        let matcher = RegexMatcherBuilder::new()
+            .case_smart(true)
+            .build(r"aBc")
+            .unwrap();
+        assert!(!matcher.is_match(b"ABC").unwrap());
+    }
+
+    // Test that finding candidate lines works as expected.
+    #[test]
+    fn candidate_lines() {
+        // Returns true only for the `Confirmed` variant.
+        fn is_confirmed(m: LineMatchKind) -> bool {
+            match m {
+                LineMatchKind::Confirmed(_) => true,
+                _ => false,
+            }
+        }
+        // Returns true only for the `Candidate` variant.
+        fn is_candidate(m: LineMatchKind) -> bool {
+            match m {
+                LineMatchKind::Candidate(_) => true,
+                _ => false,
+            }
+        }
+
+        // With no line terminator set, we can't employ any optimizations,
+        // so we get a confirmed match.
+        let matcher = RegexMatcherBuilder::new()
+            .build(r"\wfoo\s")
+            .unwrap();
+        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
+        assert!(is_confirmed(m));
+
+        // With a line terminator and a regex specially crafted to have an
+        // easy-to-detect inner literal, we can apply an optimization that
+        // quickly finds candidate matches.
+        let matcher = RegexMatcherBuilder::new()
+            .line_terminator(Some(b'\n'))
+            .build(r"\wfoo\s")
+            .unwrap();
+        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
+        assert!(is_candidate(m));
+    }
+}
--- a/grep-regex/src/non_matching.rs
+++ b/grep-regex/src/non_matching.rs
@ -0,0 +1,128 @@
+use grep_matcher::ByteSet;
+use regex_syntax::hir::{self, Hir, HirKind};
+use utf8_ranges::Utf8Sequences;
+
+/// Compute the set of bytes that can never appear in a match of `expr`.
+///
+/// The computation starts from the full byte set and discards every byte
+/// that some match could contain; whatever remains is returned.
+pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
+    let mut remaining = ByteSet::full();
+    remove_matching_bytes(expr, &mut remaining);
+    remaining
+}
+
+/// Remove any bytes from the given set that can occur in a match produced by
+/// the given expression.
+///
+/// The expression is walked recursively, so bytes contributed by every
+/// sub-expression are removed as well.
+fn remove_matching_bytes(
+    expr: &Hir,
+    set: &mut ByteSet,
+) {
+    match *expr.kind() {
+        // Zero-width expressions contribute no bytes to a match.
+        HirKind::Empty
+        | HirKind::Anchor(_)
+        | HirKind::WordBoundary(_) => {}
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            set.remove(b);
+        }
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            // A codepoint contributes each byte of its UTF-8 encoding.
+            let mut buf = [0; 4];
+            for &b in c.encode_utf8(&mut buf).as_bytes() {
+                set.remove(b);
+            }
+        }
+        HirKind::Class(hir::Class::Bytes(ref cls)) => {
+            for range in cls.iter() {
+                set.remove_all(range.start(), range.end());
+            }
+        }
+        HirKind::Class(hir::Class::Unicode(ref cls)) => {
+            // Working with UTF-8 byte ranges is presumably faster than
+            // encoding every codepoint and removing its bytes one by one.
+            let sequences = cls
+                .iter()
+                .flat_map(|r| Utf8Sequences::new(r.start(), r.end()));
+            for seq in sequences {
+                for byte_range in seq.as_slice() {
+                    set.remove_all(byte_range.start, byte_range.end);
+                }
+            }
+        }
+        HirKind::Repetition(ref rep) => remove_matching_bytes(&rep.hir, set),
+        HirKind::Group(ref group) => remove_matching_bytes(&group.hir, set),
+        HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
+            for sub in subs {
+                remove_matching_bytes(sub, set);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use grep_matcher::ByteSet;
+    use regex_syntax::ParserBuilder;
+
+    use super::non_matching_bytes;
+
+    /// Parses `pattern` (with invalid UTF-8 permitted) and returns its set
+    /// of non-matching bytes.
+    fn extract(pattern: &str) -> ByteSet {
+        let expr = ParserBuilder::new()
+            .allow_invalid_utf8(true)
+            .build()
+            .parse(pattern)
+            .unwrap();
+        non_matching_bytes(&expr)
+    }
+
+    /// Lists, in ascending order, every byte contained in `set`.
+    fn sparse(set: &ByteSet) -> Vec<u8> {
+        let mut sparse_set = vec![];
+        for b in (0..256).map(|b| b as u8) {
+            if set.contains(b) {
+                sparse_set.push(b);
+            }
+        }
+        sparse_set
+    }
+
+    /// Lists, in ascending order, every byte value that is NOT in `except`.
+    fn sparse_except(except: &[u8]) -> Vec<u8> {
+        let mut except_set = vec![false; 256];
+        for &b in except {
+            except_set[b as usize] = true;
+        }
+
+        let mut set = vec![];
+        for b in (0..256).map(|b| b as u8) {
+            if !except_set[b as usize] {
+                set.push(b);
+            }
+        }
+        set
+    }
+
+    #[test]
+    fn dot() {
+        // In Unicode mode, the bytes 192, 193 and 245-255 remain because
+        // they never occur in valid UTF-8.
+        assert_eq!(sparse(&extract(".")), vec![
+            b'\n',
+            192, 193, 245, 246, 247, 248, 249,
+            250, 251, 252, 253, 254, 255,
+        ]);
+        assert_eq!(sparse(&extract("(?s).")), vec![
+            192, 193, 245, 246, 247, 248, 249,
+            250, 251, 252, 253, 254, 255,
+        ]);
+        // With Unicode disabled, `.` matches arbitrary bytes.
+        assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
+        assert_eq!(sparse(&extract("(?s-u).")), vec![]);
+    }
+
+    #[test]
+    fn literal() {
+        assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));
+        assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83]));
+        // In Unicode mode `\xFF` is the codepoint U+00FF, whose UTF-8
+        // encoding is the two bytes 0xC3 0xBF.
+        assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));
+        assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));
+    }
+}
--- a/grep-regex/src/strip.rs
+++ b/grep-regex/src/strip.rs
@ -0,0 +1,154 @@
+use grep_matcher::LineTerminator;
+use regex_syntax::hir::{self, Hir, HirKind};
+
+use error::{Error, ErrorKind};
+
+/// Rewrite `expr` into an HIR that can never match `line_term`, when such a
+/// rewrite is possible.
+///
+/// An error is returned when the rewrite cannot be done. As a rule, a
+/// literal line terminator anywhere in the HIR is an error. The exception
+/// is a line terminator inside a character class that still has at least
+/// one other member afterwards: there the terminator is simply removed from
+/// the class.
+///
+/// For a CRLF terminator, both `\r` and `\n` are stripped. For a single
+/// byte terminator, an error is returned if that byte is not ASCII.
+pub fn strip_from_match(
+    expr: Hir,
+    line_term: LineTerminator,
+) -> Result<Hir, Error> {
+    if line_term.is_crlf() {
+        return strip_from_match_ascii(expr, b'\r')
+            .and_then(|stripped| strip_from_match_ascii(stripped, b'\n'));
+    }
+    match line_term.as_byte() {
+        byte if byte > 0x7F => {
+            Err(Error::new(ErrorKind::InvalidLineTerminator(byte)))
+        }
+        byte => strip_from_match_ascii(expr, byte),
+    }
+}
+
+/// The workhorse behind `strip_from_match`.
+///
+/// Rebuilds `expr` so that it cannot match `byte`: literals equal to `byte`
+/// are rejected, character classes have `byte` removed (and are rejected if
+/// that leaves them empty), and compound expressions are rebuilt from their
+/// recursively stripped children.
+///
+/// # Panics
+///
+/// Panics if `byte` is not ASCII.
+fn strip_from_match_ascii(
+    expr: Hir,
+    byte: u8,
+) -> Result<Hir, Error> {
+    assert!(byte <= 0x7F);
+    let chr = byte as char;
+    assert_eq!(chr.len_utf8(), 1);
+
+    let not_allowed = || Error::new(ErrorKind::NotAllowed(chr.to_string()));
+
+    match expr.into_kind() {
+        HirKind::Empty => Ok(Hir::empty()),
+        HirKind::Anchor(x) => Ok(Hir::anchor(x)),
+        HirKind::WordBoundary(x) => Ok(Hir::word_boundary(x)),
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            if c == chr {
+                Err(not_allowed())
+            } else {
+                Ok(Hir::literal(hir::Literal::Unicode(c)))
+            }
+        }
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            if b == byte {
+                Err(not_allowed())
+            } else {
+                Ok(Hir::literal(hir::Literal::Byte(b)))
+            }
+        }
+        HirKind::Class(hir::Class::Unicode(mut cls)) => {
+            let unwanted = hir::ClassUnicodeRange::new(chr, chr);
+            cls.difference(&hir::ClassUnicode::new(Some(unwanted)));
+            // A class with nothing left could only have matched the line
+            // terminator, which is not permitted.
+            if cls.ranges().is_empty() {
+                Err(not_allowed())
+            } else {
+                Ok(Hir::class(hir::Class::Unicode(cls)))
+            }
+        }
+        HirKind::Class(hir::Class::Bytes(mut cls)) => {
+            let unwanted = hir::ClassBytesRange::new(byte, byte);
+            cls.difference(&hir::ClassBytes::new(Some(unwanted)));
+            if cls.ranges().is_empty() {
+                Err(not_allowed())
+            } else {
+                Ok(Hir::class(hir::Class::Bytes(cls)))
+            }
+        }
+        HirKind::Repetition(mut rep) => {
+            let inner = strip_from_match_ascii(*rep.hir, byte)?;
+            rep.hir = Box::new(inner);
+            Ok(Hir::repetition(rep))
+        }
+        HirKind::Group(mut group) => {
+            let inner = strip_from_match_ascii(*group.hir, byte)?;
+            group.hir = Box::new(inner);
+            Ok(Hir::group(group))
+        }
+        HirKind::Concat(subs) => {
+            subs.into_iter()
+                .map(|sub| strip_from_match_ascii(sub, byte))
+                .collect::<Result<Vec<Hir>, Error>>()
+                .map(Hir::concat)
+        }
+        HirKind::Alternation(subs) => {
+            subs.into_iter()
+                .map(|sub| strip_from_match_ascii(sub, byte))
+                .collect::<Result<Vec<Hir>, Error>>()
+                .map(Hir::alternation)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use regex_syntax::Parser;
+
+    use error::Error;
+    use super::{LineTerminator, strip_from_match};
+
+    /// Strips the single-byte terminator `byte` from `pattern` and returns
+    /// the resulting expression printed as a string. Panics on failure.
+    fn roundtrip(pattern: &str, byte: u8) -> String {
+        roundtrip_line_term(pattern, LineTerminator::byte(byte)).unwrap()
+    }
+
+    /// Like `roundtrip`, but strips a CRLF line terminator.
+    fn roundtrip_crlf(pattern: &str) -> String {
+        roundtrip_line_term(pattern, LineTerminator::crlf()).unwrap()
+    }
+
+    /// Like `roundtrip`, but hands the stripping error back to the caller
+    /// instead of panicking.
+    fn roundtrip_err(pattern: &str, byte: u8) -> Result<String, Error> {
+        roundtrip_line_term(pattern, LineTerminator::byte(byte))
+    }
+
+    /// Parses `pattern`, strips `line_term` from it and prints the result.
+    /// A pattern that fails to parse panics; a stripping failure is
+    /// returned as an error.
+    fn roundtrip_line_term(
+        pattern: &str,
+        line_term: LineTerminator,
+    ) -> Result<String, Error> {
+        let expr1 = Parser::new().parse(pattern).unwrap();
+        let expr2 = strip_from_match(expr1, line_term)?;
+        Ok(expr2.to_string())
+    }
+
+    #[test]
+    fn various() {
+        // A terminator inside a class with other members is simply dropped.
+        assert_eq!(roundtrip(r"[a\n]", b'\n'), "[a]");
+        assert_eq!(roundtrip(r"[a\n]", b'a'), "[\n]");
+        assert_eq!(roundtrip_crlf(r"[a\n]"), "[a]");
+        assert_eq!(roundtrip_crlf(r"[a\r]"), "[a]");
+        assert_eq!(roundtrip_crlf(r"[a\r\n]"), "[a]");
+
+        assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])");
+        assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])");
+
+        // A literal terminator, however it is spelled, is always an error.
+        assert!(roundtrip_err(r"\n", b'\n').is_err());
+        assert!(roundtrip_err(r"abc\n", b'\n').is_err());
+        assert!(roundtrip_err(r"\nabc", b'\n').is_err());
+        assert!(roundtrip_err(r"abc\nxyz", b'\n').is_err());
+        assert!(roundtrip_err(r"\x0A", b'\n').is_err());
+        assert!(roundtrip_err(r"\u000A", b'\n').is_err());
+        assert!(roundtrip_err(r"\U0000000A", b'\n').is_err());
+        assert!(roundtrip_err(r"\u{A}", b'\n').is_err());
+        assert!(roundtrip_err("\n", b'\n').is_err());
+    }
+}
--- a/grep-regex/src/util.rs
+++ b/grep-regex/src/util.rs
@ -0,0 +1,29 @@
+/// Renders a byte string as regex syntax that stands for those bytes.
+///
+/// ASCII bytes that are not regex meta characters are emitted as-is. Every
+/// other byte is emitted as a two digit lowercase `\xNN` escape.
+pub fn bytes_to_regex(bs: &[u8]) -> String {
+    use std::fmt::Write;
+    use regex_syntax::is_meta_character;
+
+    let mut pattern = String::with_capacity(bs.len());
+    for &byte in bs {
+        let needs_escape = byte > 0x7F || is_meta_character(byte as char);
+        if needs_escape {
+            write!(pattern, r"\x{:02x}", byte).unwrap();
+        } else {
+            pattern.push(byte as char);
+        }
+    }
+    pattern
+}
+
+/// Converts arbitrary bytes to a nice string.
+///
+/// Each byte is rendered with `std::ascii::escape_default`, so printable
+/// ASCII is kept as-is, common control characters and quotes/backslashes
+/// get their usual backslash escapes, and everything else becomes `\xNN`.
+pub fn show_bytes(bs: &[u8]) -> String {
+    use std::ascii::escape_default;
+
+    // Every input byte yields at least one output character, so the input
+    // length is a cheap lower bound for the capacity.
+    let mut nice = String::with_capacity(bs.len());
+    for &b in bs {
+        // The escape iterator only yields ASCII bytes, so each one maps
+        // directly to a `char` without an intermediate buffer.
+        nice.extend(escape_default(b).map(char::from));
+    }
+    nice
+}
--- a/grep-regex/src/word.rs
+++ b/grep-regex/src/word.rs
@ -0,0 +1,196 @@
+use std::collections::HashMap;
+use std::cell::RefCell;
+use std::sync::Arc;
+
+use grep_matcher::{Match, Matcher, NoError};
+use regex::bytes::{CaptureLocations, Regex};
+use thread_local::CachedThreadLocal;
+
+use config::ConfiguredHIR;
+use error::Error;
+use matcher::RegexCaptures;
+
+/// A matcher for implementing "word match" semantics.
+///
+/// The caller's pattern is wrapped in an extra capturing group surrounded by
+/// non-word/line-boundary assertions. The wrapping is hidden from callers:
+/// the wrapped pattern is presented as capture group `0`.
+#[derive(Debug)]
+pub struct WordMatcher {
+    /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
+    regex: Regex,
+    /// A map from capture group name to capture group index. The indices
+    /// stored here are already shifted down by one to hide the wrapping
+    /// group.
+    names: HashMap<String, usize>,
+    /// A reusable buffer for finding the match location of the inner group.
+    locs: Arc<CachedThreadLocal<RefCell<CaptureLocations>>>,
+}
+
+impl Clone for WordMatcher {
+    /// Clones the regex and the name table, but gives the clone a brand new
+    /// (empty) `locs` buffer instead of sharing the existing one.
+    fn clone(&self) -> WordMatcher {
+        let regex = self.regex.clone();
+        let names = self.names.clone();
+        // Clone is implemented by hand so that the copy receives a fresh
+        // CachedThreadLocal which can set its own thread owner. This permits
+        // each thread using `locs` to hit the fast path.
+        let locs = Arc::new(CachedThreadLocal::new());
+        WordMatcher { regex: regex, names: names, locs: locs }
+    }
+}
+
+impl WordMatcher {
+    /// Build a matcher from the given pattern whose matches are restricted
+    /// to those considered "words."
+    ///
+    /// The configuration carried by `expr` is reused when the wrapped
+    /// regular expression is constructed internally. An error is returned
+    /// if the wrapped pattern cannot be built into a regex.
+    pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
+        let wrapped = expr.with_pattern(|pat| {
+            format!(r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)", pat)
+        })?;
+        let regex = wrapped.regex()?;
+
+        // Named groups are recorded one index lower than the regex reports
+        // them, which hides the capturing group added by the wrapping.
+        let names = regex
+            .capture_names()
+            .enumerate()
+            .filter_map(|(i, optional_name)| {
+                optional_name.map(|name| {
+                    (name.to_string(), i.checked_sub(1).unwrap())
+                })
+            })
+            .collect::<HashMap<String, usize>>();
+        let locs = Arc::new(CachedThreadLocal::new());
+        Ok(WordMatcher { regex: regex, names: names, locs: locs })
+    }
+}
+
+impl Matcher for WordMatcher {
+    type Captures = RegexCaptures;
+    type Error = NoError;
+
+    /// Returns the first word match in `haystack` when searching from
+    /// offset `at`, or `None` if there is no such match.
+    ///
+    /// The offsets returned are those of the caller's original pattern,
+    /// not of the surrounding boundary assertions.
+    fn find_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<Match>, NoError> {
+        // To make this easy to get right, we extract captures here instead of
+        // calling `find_at`. The actual match is at capture group `1` instead
+        // of `0`. We *could* use `find_at` here and then trim the match after
+        // the fact, but that's a bit harder to get right, and it's not clear
+        // if it's worth it.
+
+        let cell = self.locs.get_or(|| {
+            Box::new(RefCell::new(self.regex.capture_locations()))
+        });
+        let mut caps = cell.borrow_mut();
+        // The buffer is reused across searches, so only trust its contents
+        // when the regex reports that this particular search succeeded.
+        if self.regex.captures_read_at(&mut caps, haystack, at).is_none() {
+            return Ok(None);
+        }
+        Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
+    }
+
+    /// Creates empty captures whose indices are shifted by one, so that the
+    /// caller's pattern appears as group `0`.
+    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
+        Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
+    }
+
+    /// Returns the number of capture groups, not counting the regex's own
+    /// group `0` that is hidden by the wrapping.
+    fn capture_count(&self) -> usize {
+        self.regex.captures_len().checked_sub(1).unwrap()
+    }
+
+    /// Resolves a capture group name to its (already shifted) index.
+    fn capture_index(&self, name: &str) -> Option<usize> {
+        self.names.get(name).cloned()
+    }
+
+    /// Searches `haystack` from offset `at`, writing the capture group
+    /// locations into `caps`. Returns whether a match was found.
+    fn captures_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        caps: &mut RegexCaptures,
+    ) -> Result<bool, NoError> {
+        let r = self.regex.captures_read_at(caps.locations(), haystack, at);
+        Ok(r.is_some())
+    }
+
+    // We specifically do not implement other methods like find_iter or
+    // captures_iter. Namely, the iter methods are guaranteed to be correct
+    // by virtue of implementing find_at and captures_at above.
+}
+
+#[cfg(test)]
+mod tests {
+    use grep_matcher::{Captures, Match, Matcher};
+    use config::Config;
+    use super::WordMatcher;
+
+    /// Builds a `WordMatcher` for `pattern` using the default configuration.
+    fn matcher(pattern: &str) -> WordMatcher {
+        let chir = Config::default().hir(pattern).unwrap();
+        WordMatcher::new(&chir).unwrap()
+    }
+
+    /// Returns the offsets of the first word match of `pattern` in
+    /// `haystack`, using the `find` API.
+    fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
+        matcher(pattern)
+            .find(haystack.as_bytes())
+            .unwrap()
+            .map(|m| (m.start(), m.end()))
+    }
+
+    /// Same as `find`, but obtains the offsets from capture group `0` via
+    /// the captures API.
+    fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
+        let m = matcher(pattern);
+        let mut caps = m.new_captures().unwrap();
+        if !m.captures(haystack.as_bytes(), &mut caps).unwrap() {
+            None
+        } else {
+            caps.get(0).map(|m| (m.start(), m.end()))
+        }
+    }
+
+    // Test that the standard `find` API reports offsets correctly.
+    #[test]
+    fn various_find() {
+        assert_eq!(Some((0, 3)), find(r"foo", "foo"));
+        assert_eq!(Some((0, 3)), find(r"foo", "foo("));
+        assert_eq!(Some((1, 4)), find(r"foo", "!foo("));
+        assert_eq!(None, find(r"foo", "!afoo("));
+
+        assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
+        assert_eq!(None, find(r"foo", "fooб"));
+        // NOTE(review): the disabled assertion below looks like the result
+        // that would be expected if `б` were not treated as a word
+        // character (e.g. ASCII-only `\W`) - confirm or remove.
+        // assert_eq!(Some((0, 3)), find(r"foo", "fooб"));
+
+        // See: https://github.com/BurntSushi/ripgrep/issues/389
+        assert_eq!(Some((0, 2)), find(r"-2", "-2"));
+    }
+
+    // Test that the captures API also reports offsets correctly, just as
+    // find does. This exercises a different path in the code since captures
+    // are handled differently.
+    #[test]
+    fn various_captures() {
+        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));
+        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));
+        assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));
+        assert_eq!(None, find_by_caps(r"foo", "!afoo("));
+
+        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));
+        assert_eq!(None, find_by_caps(r"foo", "fooб"));
+        // NOTE(review): see the matching note in `various_find` about this
+        // disabled assertion - confirm or remove.
+        // assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб"));
+
+        // See: https://github.com/BurntSushi/ripgrep/issues/389
+        assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));
+    }
+
+    // Test that the capture reporting methods work as advertised.
+    #[test]
+    fn capture_indexing() {
+        // Three explicit groups plus the implicit whole-match group give
+        // four visible groups; the wrapping group itself is not counted.
+        let m = matcher(r"(a)(?P<foo>b)(c)");
+        assert_eq!(4, m.capture_count());
+        assert_eq!(Some(2), m.capture_index("foo"));
+
+        let mut caps = m.new_captures().unwrap();
+        assert_eq!(4, caps.len());
+
+        assert!(m.captures(b"abc", &mut caps).unwrap());
+        assert_eq!(caps.get(0), Some(Match::new(0, 3)));
+        assert_eq!(caps.get(1), Some(Match::new(0, 1)));
+        assert_eq!(caps.get(2), Some(Match::new(1, 2)));
+        assert_eq!(caps.get(3), Some(Match::new(2, 3)));
+        assert_eq!(caps.get(4), None);
+
+        // The surrounding `#` characters are consumed by the boundary
+        // assertions but must not show up in any reported offsets.
+        assert!(m.captures(b"#abc#", &mut caps).unwrap());
+        assert_eq!(caps.get(0), Some(Match::new(1, 4)));
+        assert_eq!(caps.get(1), Some(Match::new(1, 2)));
+        assert_eq!(caps.get(2), Some(Match::new(2, 3)));
+        assert_eq!(caps.get(3), Some(Match::new(3, 4)));
+        assert_eq!(caps.get(4), None);
+    }
+}