deps: initial migration steps to regex 1.9

This leaves the grep-regex crate in tatters. Pretty much the entire thing needs to be re-worked. The upshot is that it should result in some big simplifications. I hope. The idea here is to drop down and actually use regex-automata 0.3 instead of the regex crate itself.
2025-08-10 05:59:25 +02:00 · 2023-06-11 21:25:23 -04:00
parent a7f1276021
commit 1035f6b1ff
15 changed files with 606 additions and 558 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,18 +4,9 @@ version = 3

 [[package]]
 name = "aho-corasick"
-version = "0.7.20"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
-dependencies = [
- "memchr",
-]
-
-[[package]]
-name = "aho-corasick"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
+checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
 dependencies = [
 "memchr",
 ]
@@ -40,7 +31,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
 dependencies = [
 "memchr",
 "once_cell",
- "regex-automata",
+ "regex-automata 0.1.10",
 "serde",
 ]

@@ -131,7 +122,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 name = "globset"
 version = "0.4.10"
 dependencies = [
- "aho-corasick 0.7.20",
+ "aho-corasick",
 "bstr",
 "fnv",
 "glob",
@@ -204,12 +195,12 @@ dependencies = [
 name = "grep-regex"
 version = "0.1.11"
 dependencies = [
- "aho-corasick 0.7.20",
+ "aho-corasick",
 "bstr",
 "grep-matcher",
 "log",
 "regex",
- "regex-syntax 0.6.29",
+ "regex-syntax",
 "thread_local",
 ]

@@ -287,9 +278,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"

 [[package]]
 name = "libc"
-version = "0.2.144"
+version = "0.2.146"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
+checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b"

 [[package]]
 name = "libm"
@@ -299,12 +290,9 @@ checksum = "7fc7aa29613bd6a620df431842069224d8bc9011086b1db4c0e0cd47fa03ec9a"

 [[package]]
 name = "log"
-version = "0.4.17"
+version = "0.4.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
-dependencies = [
- "cfg-if",
-]
+checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"

 [[package]]
 name = "memchr"
@@ -323,9 +311,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.17.1"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"

 [[package]]
 name = "packed_simd_2"
@@ -368,31 +356,30 @@ checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"

 [[package]]
 name = "proc-macro2"
-version = "1.0.58"
+version = "1.0.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
+checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406"
 dependencies = [
 "unicode-ident",
 ]

 [[package]]
 name = "quote"
-version = "1.0.27"
+version = "1.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
+checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
 dependencies = [
 "proc-macro2",
 ]

 [[package]]
 name = "regex"
-version = "1.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81ca098a9821bd52d6b24fd8b10bd081f47d39c22778cafaa75a2857a62c6390"
+version = "1.8.4"
 dependencies = [
- "aho-corasick 1.0.1",
+ "aho-corasick",
 "memchr",
- "regex-syntax 0.7.2",
+ "regex-automata 0.3.0",
+ "regex-syntax",
 ]

 [[package]]
@@ -402,16 +389,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"

 [[package]]
-name = "regex-syntax"
-version = "0.6.29"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+name = "regex-automata"
+version = "0.3.0"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]

 [[package]]
 name = "regex-syntax"
 version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"

 [[package]]
 name = "ripgrep"
@@ -449,18 +437,18 @@ dependencies = [

 [[package]]
 name = "serde"
-version = "1.0.163"
+version = "1.0.164"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
+checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d"
 dependencies = [
 "serde_derive",
 ]

 [[package]]
 name = "serde_derive"
-version = "1.0.163"
+version = "1.0.164"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
+checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -486,9 +474,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"

 [[package]]
 name = "syn"
-version = "2.0.16"
+version = "2.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
+checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e"
 dependencies = [
 "proc-macro2",
 "quote",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,11 @@ autotests = false
 edition = "2018"
 rust-version = "1.65"

+[patch.crates-io]
+regex = { path = "/home/andrew/rust/regex" }
+regex-automata = { path = "/home/andrew/rust/regex/regex-automata" }
+regex-syntax = { path = "/home/andrew/rust/regex/regex-syntax" }
+
 [[bin]]
 bench = false
 path = "crates/core/main.rs"
@@ -47,7 +52,7 @@ grep = { version = "0.2.12", path = "crates/grep" }
 ignore = { version = "0.4.19", path = "crates/ignore" }
 lazy_static = "1.1.0"
 log = "0.4.5"
-regex = "1.3.5"
+regex = "1.8.3"
 serde_json = "1.0.23"
 termcolor = "1.1.0"

--- a/crates/core/args.rs
+++ b/crates/core/args.rs
@@ -1464,7 +1464,7 @@ impl ArgMatches {
        // own, but if the patterns are joined in a set of alternations, then
        // you wind up with `foo|`, which is currently invalid in Rust's regex
        // engine.
-        "(?:z{0})*".to_string()
+        "(?:)".to_string()
    }

    /// Converts an OsStr pattern to a String pattern. The pattern is escaped
--- a/crates/globset/Cargo.toml
+++ b/crates/globset/Cargo.toml
@@ -20,11 +20,11 @@ name = "globset"
 bench = false

 [dependencies]
-aho-corasick = "0.7.3"
-bstr = { version = "1.1.0", default-features = false, features = ["std"] }
+aho-corasick = "1.0.2"
+bstr = { version = "1.5.0", default-features = false, features = ["std"] }
 fnv = "1.0.6"
 log = { version = "0.4.5", optional = true }
-regex = { version = "1.1.5", default-features = false, features = ["perf", "std"] }
+regex = { version = "1.8.3", default-features = false, features = ["perf", "std"] }
 serde = { version = "1.0.104", optional = true }

 [dev-dependencies]
--- a/crates/globset/src/lib.rs
+++ b/crates/globset/src/lib.rs
@@ -818,7 +818,7 @@ impl MultiStrategyBuilder {

    fn prefix(self) -> PrefixStrategy {
        PrefixStrategy {
-            matcher: AhoCorasick::new_auto_configured(&self.literals),
+            matcher: AhoCorasick::new(&self.literals).unwrap(),
            map: self.map,
            longest: self.longest,
        }
@@ -826,7 +826,7 @@ impl MultiStrategyBuilder {

    fn suffix(self) -> SuffixStrategy {
        SuffixStrategy {
-            matcher: AhoCorasick::new_auto_configured(&self.literals),
+            matcher: AhoCorasick::new(&self.literals).unwrap(),
            map: self.map,
            longest: self.longest,
        }
--- a/crates/regex/Cargo.toml
+++ b/crates/regex/Cargo.toml
@@ -14,10 +14,10 @@ license = "Unlicense OR MIT"
 edition = "2018"

 [dependencies]
-aho-corasick = "0.7.3"
-bstr = "1.1.0"
+aho-corasick = "1.0.2"
+bstr = "1.5.0"
 grep-matcher = { version = "0.1.6", path = "../matcher" }
 log = "0.4.5"
-regex = "1.1"
-regex-syntax = "0.6.5"
-thread_local = "1.1.2"
+regex = "1.8.3"
+regex-syntax = "0.7.2"
+thread_local = "1.1.7"
--- a/crates/regex/src/config.rs
+++ b/crates/regex/src/config.rs
@@ -71,7 +71,7 @@ impl Config {
        let ast = self.ast(pattern)?;
        let analysis = self.analysis(&ast)?;
        let expr = hir::translate::TranslatorBuilder::new()
-            .allow_invalid_utf8(true)
+            .utf8(false)
            .case_insensitive(self.is_case_insensitive(&analysis))
            .multi_line(self.multi_line)
            .dot_matches_new_line(self.dot_matches_new_line)
@@ -172,7 +172,12 @@ impl ConfiguredHIR {
    /// CRLF hack is enabled and the regex is line anchored at the end. In
    /// this case, matches that end with a `\r` have the `\r` stripped.
    pub fn needs_crlf_stripped(&self) -> bool {
-        self.config.crlf && self.expr.is_line_anchored_end()
+        self.config.crlf
+            && self
+                .expr
+                .properties()
+                .look_set_suffix_any()
+                .contains(hir::Look::EndLF)
    }

    /// Returns the line terminator configured on this expression.
@@ -202,7 +207,7 @@ impl ConfiguredHIR {

    /// Returns true if and only if the underlying HIR has any text anchors.
    fn is_any_anchored(&self) -> bool {
-        self.expr.is_any_anchored_start() || self.expr.is_any_anchored_end()
+        self.expr.properties().look_set().contains_anchor_haystack()
    }

    /// Builds a regular expression from this HIR expression.
@@ -301,7 +306,7 @@ impl ConfiguredHIR {
        let expr = ::regex_syntax::ParserBuilder::new()
            .nest_limit(self.config.nest_limit)
            .octal(self.config.octal)
-            .allow_invalid_utf8(true)
+            .utf8(false)
            .multi_line(self.config.multi_line)
            .dot_matches_new_line(self.config.dot_matches_new_line)
            .unicode(self.config.unicode)
--- a/crates/regex/src/crlf.rs
+++ b/crates/regex/src/crlf.rs
@@ -124,32 +124,26 @@ pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
 /// nicely in most cases, especially when a match is limited to a single line.
 pub fn crlfify(expr: Hir) -> Hir {
    match expr.into_kind() {
-        HirKind::Anchor(hir::Anchor::EndLine) => {
-            let concat = Hir::concat(vec![
-                Hir::repetition(hir::Repetition {
-                    kind: hir::RepetitionKind::ZeroOrOne,
-                    greedy: false,
-                    hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
-                }),
-                Hir::anchor(hir::Anchor::EndLine),
-            ]);
-            Hir::group(hir::Group {
-                kind: hir::GroupKind::NonCapturing,
-                hir: Box::new(concat),
-            })
-        }
+        HirKind::Look(hir::Look::EndLF) => Hir::concat(vec![
+            Hir::repetition(hir::Repetition {
+                min: 0,
+                max: Some(1),
+                greedy: false,
+                sub: Box::new(Hir::literal("\r".as_bytes())),
+            }),
+            Hir::look(hir::Look::EndLF),
+        ]),
        HirKind::Empty => Hir::empty(),
-        HirKind::Literal(x) => Hir::literal(x),
+        HirKind::Literal(hir::Literal(x)) => Hir::literal(x),
        HirKind::Class(x) => Hir::class(x),
-        HirKind::Anchor(x) => Hir::anchor(x),
-        HirKind::WordBoundary(x) => Hir::word_boundary(x),
+        HirKind::Look(x) => Hir::look(x),
        HirKind::Repetition(mut x) => {
-            x.hir = Box::new(crlfify(*x.hir));
+            x.sub = Box::new(crlfify(*x.sub));
            Hir::repetition(x)
        }
-        HirKind::Group(mut x) => {
-            x.hir = Box::new(crlfify(*x.hir));
-            Hir::group(x)
+        HirKind::Capture(mut x) => {
+            x.sub = Box::new(crlfify(*x.sub));
+            Hir::capture(x)
        }
        HirKind::Concat(xs) => {
            Hir::concat(xs.into_iter().map(crlfify).collect())
@@ -174,12 +168,12 @@ mod tests {
    #[test]
    fn various() {
        assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");
-        assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))");
+        assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$)\r??(?m:$))");
        assert_eq!(
            roundtrip(r"(?m)(?:foo$|bar$)"),
-            "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))"
+            "(?:(?:(?:foo)\r??(?m:$))|(?:(?:bar)\r??(?m:$)))"
        );
-        assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a");
+        assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$)a)");

        // Not a multiline `$`, so no crlfifying occurs.
        assert_eq!(roundtrip(r"$"), "\\z");
--- a/crates/regex/src/literal.rs
+++ b/crates/regex/src/literal.rs
@@ -1,43 +1,12 @@
-/*
-This module is responsible for extracting *inner* literals out of the AST of a
-regular expression. Normally this is the job of the regex engine itself, but
-the regex engine doesn't look for inner literals. Since we're doing line based
-searching, we can use them, so we need to do it ourselves.
-*/
+use regex_syntax::hir::Hir;

-use bstr::ByteSlice;
-use regex_syntax::hir::literal::{Literal, Literals};
-use regex_syntax::hir::{self, Hir, HirKind};
-
-use crate::util;
-
-/// Represents prefix, suffix and inner "required" literals for a regular
-/// expression.
-///
-/// Prefixes and suffixes are detected using regex-syntax. The inner required
-/// literals are detected using something custom (but based on the code in
-/// regex-syntax).
 #[derive(Clone, Debug)]
-pub struct LiteralSets {
-    /// A set of prefix literals.
-    prefixes: Literals,
-    /// A set of suffix literals.
-    suffixes: Literals,
-    /// A set of literals such that at least one of them must appear in every
-    /// match. A literal in this set may be neither a prefix nor a suffix.
-    required: Literals,
-}
+pub struct LiteralSets {}

 impl LiteralSets {
    /// Create a set of literals from the given HIR expression.
-    pub fn new(expr: &Hir) -> LiteralSets {
-        let mut required = Literals::empty();
-        union_required(expr, &mut required);
-        LiteralSets {
-            prefixes: Literals::prefixes(expr),
-            suffixes: Literals::suffixes(expr),
-            required,
-        }
+    pub fn new(_: &Hir) -> LiteralSets {
+        LiteralSets {}
    }

    /// If it is deemed advantageous to do so (via various suspicious
@@ -46,383 +15,7 @@ impl LiteralSets {
    /// generated these literal sets. The idea here is that the pattern
    /// returned by this method is much cheaper to search for. i.e., It is
    /// usually a single literal or an alternation of literals.
-    pub fn one_regex(&self, word: bool) -> Option<String> {
-        // TODO: The logic in this function is basically inscrutable. It grew
-        // organically in the old grep 0.1 crate. Ideally, it would be
-        // re-worked. In fact, the entire inner literal extraction should be
-        // re-worked. Actually, most of regex-syntax's literal extraction
-        // should also be re-worked. Alas... only so much time in the day.
-
-        if !word {
-            if self.prefixes.all_complete() && !self.prefixes.is_empty() {
-                log::debug!("literal prefixes detected: {:?}", self.prefixes);
-                // When this is true, the regex engine will do a literal scan,
-                // so we don't need to return anything. But we only do this
-                // if we aren't doing a word regex, since a word regex adds
-                // a `(?:\W|^)` to the beginning of the regex, thereby
-                // defeating the regex engine's literal detection.
-                return None;
-            }
-        }
-
-        // Out of inner required literals, prefixes and suffixes, which one
-        // is the longest? We pick the longest to do fast literal scan under
-        // the assumption that a longer literal will have a lower false
-        // positive rate.
-        let pre_lcp = self.prefixes.longest_common_prefix();
-        let pre_lcs = self.prefixes.longest_common_suffix();
-        let suf_lcp = self.suffixes.longest_common_prefix();
-        let suf_lcs = self.suffixes.longest_common_suffix();
-
-        let req_lits = self.required.literals();
-        let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
-            None => &[],
-            Some(req) => &***req,
-        };
-
-        let mut lit = pre_lcp;
-        if pre_lcs.len() > lit.len() {
-            lit = pre_lcs;
-        }
-        if suf_lcp.len() > lit.len() {
-            lit = suf_lcp;
-        }
-        if suf_lcs.len() > lit.len() {
-            lit = suf_lcs;
-        }
-        if req_lits.len() == 1 && req.len() > lit.len() {
-            lit = req;
-        }
-
-        // Special case: if we detected an alternation of inner required
-        // literals and its longest literal is bigger than the longest
-        // prefix/suffix, then choose the alternation. In practice, this
-        // helps with case insensitive matching, which can generate lots of
-        // inner required literals.
-        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
-        let any_white = has_only_whitespace(&req_lits);
-        if req.len() > lit.len()
-            && req_lits.len() > 1
-            && !any_empty
-            && !any_white
-        {
-            log::debug!("required literals found: {:?}", req_lits);
-            let alts: Vec<String> = req_lits
-                .into_iter()
-                .map(|x| util::bytes_to_regex(x))
-                .collect();
-            // We're matching raw bytes, so disable Unicode mode.
-            Some(format!("(?-u:{})", alts.join("|")))
-        } else if lit.is_empty() {
-            // If we're here, then we have no LCP. No LCS. And no detected
-            // inner required literals. In theory this shouldn't happen, but
-            // the inner literal detector isn't as nice as we hope and doesn't
-            // actually support returning a set of alternating required
-            // literals. (Instead, it only returns a set where EVERY literal
-            // in it is required. It cannot currently express "either P or Q
-            // is required.")
-            //
-            // In this case, it is possible that we still have meaningful
-            // prefixes or suffixes to use. So we look for the set of literals
-            // with the highest minimum length and use that to build our "fast"
-            // regex.
-            //
-            // This manifests in fairly common scenarios. e.g.,
-            //
-            //     rg -w 'foo|bar|baz|quux'
-            //
-            // Normally, without the `-w`, the regex engine itself would
-            // detect the prefix correctly. Unfortunately, the `-w` option
-            // turns the regex into something like this:
-            //
-            //     rg '(^|\W)(foo|bar|baz|quux)($|\W)'
-            //
-            // Which will defeat all prefix and suffix literal optimizations.
-            // (Not in theory---it could be better. But the current
-            // implementation isn't good enough.) ... So we make up for it
-            // here.
-            if !word {
-                return None;
-            }
-            let p_min_len = self.prefixes.min_len();
-            let s_min_len = self.suffixes.min_len();
-            let lits = match (p_min_len, s_min_len) {
-                (None, None) => return None,
-                (Some(_), None) => {
-                    log::debug!("prefix literals found");
-                    self.prefixes.literals()
-                }
-                (None, Some(_)) => {
-                    log::debug!("suffix literals found");
-                    self.suffixes.literals()
-                }
-                (Some(p), Some(s)) => {
-                    if p >= s {
-                        log::debug!("prefix literals found");
-                        self.prefixes.literals()
-                    } else {
-                        log::debug!("suffix literals found");
-                        self.suffixes.literals()
-                    }
-                }
-            };
-
-            log::debug!("prefix/suffix literals found: {:?}", lits);
-            if has_only_whitespace(lits) {
-                log::debug!("dropping literals because one was whitespace");
-                return None;
-            }
-            let alts: Vec<String> =
-                lits.into_iter().map(|x| util::bytes_to_regex(x)).collect();
-            // We're matching raw bytes, so disable Unicode mode.
-            Some(format!("(?-u:{})", alts.join("|")))
-        } else {
-            log::debug!("required literal found: {:?}", util::show_bytes(lit));
-            if lit.chars().all(|c| c.is_whitespace()) {
-                log::debug!("dropping literal because one was whitespace");
-                return None;
-            }
-            Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))
-        }
-    }
-}
-
-fn union_required(expr: &Hir, lits: &mut Literals) {
-    match *expr.kind() {
-        HirKind::Literal(hir::Literal::Unicode(c)) => {
-            let mut buf = [0u8; 4];
-            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
-        }
-        HirKind::Literal(hir::Literal::Byte(b)) => {
-            lits.cross_add(&[b]);
-        }
-        HirKind::Class(hir::Class::Unicode(ref cls)) => {
-            if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
-                lits.cut();
-            }
-        }
-        HirKind::Class(hir::Class::Bytes(ref cls)) => {
-            if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
-                lits.cut();
-            }
-        }
-        HirKind::Group(hir::Group { ref hir, .. }) => {
-            union_required(&**hir, lits);
-        }
-        HirKind::Repetition(ref x) => match x.kind {
-            hir::RepetitionKind::ZeroOrOne => lits.cut(),
-            hir::RepetitionKind::ZeroOrMore => lits.cut(),
-            hir::RepetitionKind::OneOrMore => {
-                union_required(&x.hir, lits);
-            }
-            hir::RepetitionKind::Range(ref rng) => {
-                let (min, max) = match *rng {
-                    hir::RepetitionRange::Exactly(m) => (m, Some(m)),
-                    hir::RepetitionRange::AtLeast(m) => (m, None),
-                    hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
-                };
-                repeat_range_literals(
-                    &x.hir,
-                    min,
-                    max,
-                    x.greedy,
-                    lits,
-                    union_required,
-                );
-            }
-        },
-        HirKind::Concat(ref es) if es.is_empty() => {}
-        HirKind::Concat(ref es) if es.len() == 1 => {
-            union_required(&es[0], lits)
-        }
-        HirKind::Concat(ref es) => {
-            for e in es {
-                let mut lits2 = lits.to_empty();
-                union_required(e, &mut lits2);
-                if lits2.is_empty() {
-                    lits.cut();
-                    continue;
-                }
-                if lits2.contains_empty() || !is_simple(&e) {
-                    lits.cut();
-                }
-                if !lits.cross_product(&lits2) || !lits2.any_complete() {
-                    // If this expression couldn't yield any literal that
-                    // could be extended, then we need to quit. Since we're
-                    // short-circuiting, we also need to freeze every member.
-                    lits.cut();
-                    break;
-                }
-            }
-        }
-        HirKind::Alternation(ref es) => {
-            alternate_literals(es, lits, union_required);
-        }
-        _ => lits.cut(),
-    }
-}
-
-fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
-    e: &Hir,
-    min: u32,
-    _max: Option<u32>,
-    _greedy: bool,
-    lits: &mut Literals,
-    mut f: F,
-) {
-    if min == 0 {
-        // This is a bit conservative. If `max` is set, then we could
-        // treat this as a finite set of alternations. For now, we
-        // just treat it as `e*`.
-        lits.cut();
-    } else {
-        // We only extract literals from a single repetition, even though
-        // we could do more. e.g., `a{3}` will have `a` extracted instead of
-        // `aaa`. The reason is that inner literal extraction can't be unioned
-        // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
-        // is wrong.
-        f(e, lits);
-        lits.cut();
-    }
-}
-
-fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
-    es: &[Hir],
-    lits: &mut Literals,
-    mut f: F,
-) {
-    let mut lits2 = lits.to_empty();
-    for e in es {
-        let mut lits3 = lits.to_empty();
-        lits3.set_limit_size(lits.limit_size() / 5);
-        f(e, &mut lits3);
-        if lits3.is_empty() || !lits2.union(lits3) {
-            // If we couldn't find suffixes for *any* of the
-            // alternates, then the entire alternation has to be thrown
-            // away and any existing members must be frozen. Similarly,
-            // if the union couldn't complete, stop and freeze.
-            lits.cut();
-            return;
-        }
-    }
-    // All we do at the moment is look for prefixes and suffixes. If both
-    // are empty, then we report nothing. We should be able to do better than
-    // this, but we'll need something more expressive than just a "set of
-    // literals."
-    let lcp = lits2.longest_common_prefix();
-    let lcs = lits2.longest_common_suffix();
-    if !lcp.is_empty() {
-        lits.cross_add(lcp);
-    }
-    lits.cut();
-    if !lcs.is_empty() {
-        lits.add(Literal::empty());
-        lits.add(Literal::new(lcs.to_vec()));
-    }
-}
-
-fn is_simple(expr: &Hir) -> bool {
-    match *expr.kind() {
-        HirKind::Empty
-        | HirKind::Literal(_)
-        | HirKind::Class(_)
-        | HirKind::Concat(_)
-        | HirKind::Alternation(_) => true,
-        HirKind::Anchor(_)
-        | HirKind::WordBoundary(_)
-        | HirKind::Group(_)
-        | HirKind::Repetition(_) => false,
-    }
-}
-
-/// Return the number of characters in the given class.
-fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
-    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
-}
-
-/// Return the number of bytes in the given class.
-fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
-    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
-}
-
-/// Returns true if and only if any of the literals in the given set is
-/// entirely whitespace.
-fn has_only_whitespace(lits: &[Literal]) -> bool {
-    for lit in lits {
-        if lit.chars().all(|c| c.is_whitespace()) {
-            return true;
-        }
-    }
-    false
-}
-
-#[cfg(test)]
-mod tests {
-    use super::LiteralSets;
-    use regex_syntax::Parser;
-
-    fn sets(pattern: &str) -> LiteralSets {
-        let hir = Parser::new().parse(pattern).unwrap();
-        LiteralSets::new(&hir)
-    }
-
-    fn one_regex(pattern: &str) -> Option<String> {
-        sets(pattern).one_regex(false)
-    }
-
-    // Put a pattern into the same format as the one returned by `one_regex`.
-    fn pat(pattern: &str) -> Option<String> {
-        Some(format!("(?-u:{})", pattern))
-    }
-
-    #[test]
-    fn various() {
-        // Obviously no literals.
-        assert!(one_regex(r"\w").is_none());
-        assert!(one_regex(r"\pL").is_none());
-
-        // Tantalizingly close.
-        assert!(one_regex(r"\w|foo").is_none());
-
-        // There's a literal, but it's better if the regex engine handles it
-        // internally.
-        assert!(one_regex(r"abc").is_none());
-
-        // Core use cases.
-        assert_eq!(one_regex(r"\wabc\w"), pat("abc"));
-        assert_eq!(one_regex(r"abc\w"), pat("abc"));
-
-        // TODO: Make these pass. We're missing some potentially big wins
-        // without these.
-        // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
-        // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
-    }
-
-    #[test]
-    fn regression_1064() {
-        // Regression from:
-        // https://github.com/BurntSushi/ripgrep/issues/1064
-        // assert_eq!(one_regex(r"a.*c"), pat("a"));
-        assert_eq!(one_regex(r"a(.*c)"), pat("a"));
-    }
-
-    #[test]
-    fn regression_1319() {
-        // Regression from:
-        // https://github.com/BurntSushi/ripgrep/issues/1319
-        assert_eq!(
-            one_regex(r"TTGAGTCCAGGAG[ATCG]{2}C"),
-            pat("TTGAGTCCAGGAG"),
-        );
-    }
-
-    #[test]
-    fn regression_1537() {
-        // Regression from:
-        // https://github.com/BurntSushi/ripgrep/issues/1537
-        assert_eq!(one_regex(r";(.*,)"), pat(";"));
-        assert_eq!(one_regex(r";((.*,))"), pat(";"));
-        assert_eq!(one_regex(r";(.*,)+"), pat(";"),);
-        assert_eq!(one_regex(r";(.*,){1}"), pat(";"),);
+    pub fn one_regex(&self, _word: bool) -> Option<String> {
+        None
    }
 }
--- a/crates/regex/src/literalold.rs
+++ b/crates/regex/src/literalold.rs
@@ -0,0 +1,466 @@
+/*
+This module is responsible for extracting *inner* literals out of the HIR of a
+regular expression. Normally this is the job of the regex engine itself, but
+the regex engine doesn't look for inner literals. Since we're doing line based
+searching, we can use them, so we need to do it ourselves.
+*/
+
+use {
+    bstr::ByteSlice,
+    regex_syntax::hir::{
+        self,
+        literal::{self, Literal, Seq},
+        Hir, HirKind,
+    },
+};
+
+use crate::util;
+
+/// Represents prefix, suffix and inner "required" literals for a regular
+/// expression.
+///
+/// Prefixes and suffixes are detected using regex-syntax. The inner required
+/// literals are detected using something custom (but based on the code in
+/// regex-syntax).
+///
+/// All three sets are computed once by `LiteralSets::new` and are consumed by
+/// `LiteralSets::one_regex`.
+#[derive(Clone, Debug)]
+pub struct LiteralSets {
+    /// A set of prefix literals, as produced by the `prefixes` function in
+    /// this module.
+    prefixes: Seq,
+    /// A set of suffix literals, as produced by the `suffixes` function in
+    /// this module.
+    suffixes: Seq,
+    /// A set of literals such that at least one of them must appear in every
+    /// match. A literal in this set may be neither a prefix nor a suffix.
+    /// This set is built by `union_required`.
+    required: Seq,
+}
+
+impl LiteralSets {
+    /// Create a set of literals from the given HIR expression.
+    ///
+    /// Prefixes and suffixes are computed by the `prefixes` and `suffixes`
+    /// helpers in this module. The inner required literals are accumulated by
+    /// `union_required`, starting from a sequence that holds only the empty
+    /// literal.
+    pub fn new(expr: &Hir) -> LiteralSets {
+        let mut required = Seq::singleton(Literal::exact(vec![]));
+        union_required(expr, &mut required);
+        LiteralSets {
+            prefixes: prefixes(expr),
+            suffixes: suffixes(expr),
+            required,
+        }
+    }
+
+    /// If it is deemed advantageous to do so (via various suspicious
+    /// heuristics), this will return a single regular expression pattern
+    /// built from literals extracted from the regular expression that
+    /// generated these literal sets. The idea here is that the pattern
+    /// returned by this method is much cheaper to search for. i.e., It is
+    /// usually a single literal or an alternation of literals.
+    ///
+    /// `word` should be true when the regex is being searched as a word
+    /// regex. In that case an exact prefix set is not left to the regex
+    /// engine, and an alternation of prefix or suffix literals may be
+    /// returned as a fallback.
+    ///
+    /// Returns `None` when no literal is deemed worth searching for.
+    /// Otherwise, the returned pattern is always wrapped in `(?-u:...)`,
+    /// since it matches raw bytes.
+    pub fn one_regex(&self, word: bool) -> Option<String> {
+        // TODO: The logic in this function is basically inscrutable. It grew
+        // organically in the old grep 0.1 crate. Ideally, it would be
+        // re-worked. In fact, the entire inner literal extraction should be
+        // re-worked. Actually, most of regex-syntax's literal extraction
+        // should also be re-worked. Alas... only so much time in the day.
+
+        // Builds the pattern `(?-u:a|b|...)` from the given literals, with
+        // each literal escaped via `util::bytes_to_regex`.
+        fn alternation(lits: &[Literal]) -> String {
+            let alts: Vec<String> = lits
+                .iter()
+                .map(|x| util::bytes_to_regex(x.as_bytes()))
+                .collect();
+            // We're matching raw bytes, so disable Unicode mode.
+            format!("(?-u:{})", alts.join("|"))
+        }
+
+        if !word && self.prefixes.is_exact() && !self.prefixes.is_empty() {
+            log::debug!("literal prefixes detected: {:?}", self.prefixes);
+            // When this is true, the regex engine will do a literal scan,
+            // so we don't need to return anything. But we only do this
+            // if we aren't doing a word regex, since a word regex adds
+            // a `(?:\W|^)` to the beginning of the regex, thereby
+            // defeating the regex engine's literal detection.
+            return None;
+        }
+
+        // Out of inner required literals, prefixes and suffixes, which one
+        // is the longest? We pick the longest to do fast literal scan under
+        // the assumption that a longer literal will have a lower false
+        // positive rate.
+        let pre_lcp = self.prefixes.longest_common_prefix().unwrap_or(&[]);
+        let pre_lcs = self.prefixes.longest_common_suffix().unwrap_or(&[]);
+        let suf_lcp = self.suffixes.longest_common_prefix().unwrap_or(&[]);
+        let suf_lcs = self.suffixes.longest_common_suffix().unwrap_or(&[]);
+
+        let req_lits = self.required.literals().unwrap_or(&[]);
+        let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
+            None => &[],
+            Some(req) => req.as_bytes(),
+        };
+
+        // Ties keep the earlier candidate, since each comparison is strict.
+        let mut lit = pre_lcp;
+        if pre_lcs.len() > lit.len() {
+            lit = pre_lcs;
+        }
+        if suf_lcp.len() > lit.len() {
+            lit = suf_lcp;
+        }
+        if suf_lcs.len() > lit.len() {
+            lit = suf_lcs;
+        }
+        if req_lits.len() == 1 && req.len() > lit.len() {
+            lit = req;
+        }
+
+        // Special case: if we detected an alternation of inner required
+        // literals and its longest literal is bigger than the longest
+        // prefix/suffix, then choose the alternation. In practice, this
+        // helps with case insensitive matching, which can generate lots of
+        // inner required literals.
+        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
+        let any_white = has_only_whitespace(req_lits);
+        if req.len() > lit.len()
+            && req_lits.len() > 1
+            && !any_empty
+            && !any_white
+        {
+            log::debug!("required literals found: {:?}", req_lits);
+            Some(alternation(req_lits))
+        } else if lit.is_empty() {
+            // If we're here, then we have no LCP. No LCS. And no detected
+            // inner required literals. In theory this shouldn't happen, but
+            // the inner literal detector isn't as nice as we hope and doesn't
+            // actually support returning a set of alternating required
+            // literals. (Instead, it only returns a set where EVERY literal
+            // in it is required. It cannot currently express "either P or Q
+            // is required.")
+            //
+            // In this case, it is possible that we still have meaningful
+            // prefixes or suffixes to use. So we look for the set of literals
+            // with the highest minimum length and use that to build our "fast"
+            // regex.
+            //
+            // This manifests in fairly common scenarios. e.g.,
+            //
+            //     rg -w 'foo|bar|baz|quux'
+            //
+            // Normally, without the `-w`, the regex engine itself would
+            // detect the prefix correctly. Unfortunately, the `-w` option
+            // turns the regex into something like this:
+            //
+            //     rg '(^|\W)(foo|bar|baz|quux)($|\W)'
+            //
+            // Which will defeat all prefix and suffix literal optimizations.
+            // (Not in theory---it could be better. But the current
+            // implementation isn't good enough.) ... So we make up for it
+            // here.
+            if !word {
+                return None;
+            }
+            let p_min_len = self.prefixes.min_literal_len();
+            let s_min_len = self.suffixes.min_literal_len();
+            let lits = match (p_min_len, s_min_len) {
+                (None, None) => return None,
+                (Some(_), None) => {
+                    log::debug!("prefix literals found");
+                    self.prefixes.literals().unwrap()
+                }
+                (None, Some(_)) => {
+                    log::debug!("suffix literals found");
+                    self.suffixes.literals().unwrap()
+                }
+                (Some(p), Some(s)) => {
+                    if p >= s {
+                        log::debug!("prefix literals found");
+                        self.prefixes.literals().unwrap()
+                    } else {
+                        log::debug!("suffix literals found");
+                        self.suffixes.literals().unwrap()
+                    }
+                }
+            };
+
+            log::debug!("prefix/suffix literals found: {:?}", lits);
+            if has_only_whitespace(lits) {
+                log::debug!("dropping literals because one was whitespace");
+                return None;
+            }
+            Some(alternation(lits))
+        } else {
+            log::debug!("required literal found: {:?}", util::show_bytes(lit));
+            if lit.chars().all(|c| c.is_whitespace()) {
+                log::debug!("dropping literal because one was whitespace");
+                return None;
+            }
+            Some(format!("(?-u:{})", util::bytes_to_regex(lit)))
+        }
+    }
+}
+
+/// Walks `expr` and folds the literals it finds into `lits`.
+///
+/// `LiteralSets::new` calls this with `lits` initialized to a sequence
+/// holding only the empty literal, and stores the result as its "required"
+/// literal set.
+///
+/// Literals are crossed onto the end of `lits`. Classes, and any HIR kind not
+/// listed explicitly below, only mark `lits` as inexact. Captures are looked
+/// through. Repetitions and alternations are delegated to
+/// `repeat_range_literals` and `alternate_literals` respectively, with this
+/// function passed back in as the recursive callback.
+fn union_required(expr: &Hir, lits: &mut Seq) {
+    match *expr.kind() {
+        HirKind::Literal(hir::Literal(ref bytes)) => {
+            lits.cross_forward(&mut Seq::new([bytes]));
+        }
+        HirKind::Class(hir::Class::Unicode(_)) => {
+            lits.make_inexact();
+        }
+        HirKind::Class(hir::Class::Bytes(_)) => {
+            lits.make_inexact();
+        }
+        HirKind::Capture(hir::Capture { ref sub, .. }) => {
+            union_required(&**sub, lits);
+        }
+        HirKind::Repetition(hir::Repetition { min, max, greedy, ref sub }) => {
+            repeat_range_literals(
+                &sub,
+                min,
+                max,
+                greedy,
+                lits,
+                union_required,
+            );
+        }
+        HirKind::Concat(ref es) if es.is_empty() => {}
+        HirKind::Concat(ref es) if es.len() == 1 => {
+            union_required(&es[0], lits)
+        }
+        HirKind::Concat(ref es) => {
+            for e in es {
+                // Each sub-expression is extracted into its own fresh
+                // sequence before being combined with `lits`.
+                let mut lits2 = Seq::singleton(Literal::exact(vec![]));
+                union_required(e, &mut lits2);
+                // The result is a single zero-length literal, so `e`
+                // contributed no bytes: stop extending what we have so far
+                // and move on to the next sub-expression.
+                if lits2.len() == Some(1) && lits2.min_literal_len() == Some(0)
+                {
+                    lits.make_inexact();
+                    continue;
+                }
+                if lits2.min_literal_len() == Some(0) || !is_simple(&e) {
+                    lits.make_inexact();
+                }
+                lits.cross_forward(&mut lits2);
+                // NOTE(review): regex-syntax documents `cross_forward` as
+                // draining its argument. If so, `lits2` is empty by the time
+                // it is inspected here, and this check may not mean what it
+                // did before the migration -- confirm.
+                if lits2.is_inexact() {
+                    // If this expression couldn't yield any literal that
+                    // could be extended, then we need to quit. Since we're
+                    // short-circuiting, we also need to freeze every member.
+                    lits.make_inexact();
+                    break;
+                }
+            }
+        }
+        HirKind::Alternation(ref es) => {
+            alternate_literals(es, lits, union_required);
+        }
+        _ => lits.make_inexact(),
+    }
+}
+
+/// Extracts literals for a repetition of `e` into `lits`, using `f` to
+/// extract literals from `e` itself.
+///
+/// `lits` is always left inexact. `_max` and `_greedy` are accepted but not
+/// consulted.
+fn repeat_range_literals<F: FnMut(&Hir, &mut Seq)>(
+    e: &Hir,
+    min: u32,
+    _max: Option<u32>,
+    _greedy: bool,
+    lits: &mut Seq,
+    mut f: F,
+) {
+    // When `min` is zero, nothing is extracted at all. This is a bit
+    // conservative. If `max` is set, then we could treat this as a finite
+    // set of alternations. For now, we just treat it as `e*`.
+    //
+    // Otherwise, we only extract literals from a single repetition, even
+    // though we could do more. e.g., `a{3}` will have `a` extracted instead
+    // of `aaa`. The reason is that inner literal extraction can't be unioned
+    // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
+    // is wrong.
+    if min > 0 {
+        f(e, lits);
+    }
+    lits.make_inexact();
+}
+
+/// Extracts literals for the alternation `es` into `lits`, calling `f` to
+/// extract literals from each alternate.
+///
+/// Each alternate is extracted into its own sequence and the results are
+/// unioned. If any alternate yields an empty sequence, `lits` is marked
+/// inexact and nothing else is done. Otherwise, the longest common prefix of
+/// the union (if any) is crossed onto `lits`, `lits` is marked inexact, and
+/// then an empty literal plus the longest common suffix of the union (if
+/// any) are pushed onto `lits`.
+fn alternate_literals<F: FnMut(&Hir, &mut Seq)>(
+    es: &[Hir],
+    lits: &mut Seq,
+    mut f: F,
+) {
+    let mut lits2 = Seq::empty();
+    for e in es {
+        // NOTE(review): `f` is handed an empty sequence here, whereas
+        // `LiteralSets::new` and the concat case in `union_required` seed
+        // theirs with the empty literal. Presumably a cross product with an
+        // empty sequence stays empty, which would make the `is_empty`
+        // bail-out below always fire -- confirm against regex-syntax's
+        // `Seq` semantics.
+        let mut lits3 = Seq::empty();
+        // FIXME
+        // lits3.set_limit_size(lits.limit_size() / 5);
+        f(e, &mut lits3);
+        if lits3.is_empty() {
+            lits.make_inexact();
+            return;
+        }
+        lits2.union(&mut lits3);
+    }
+    // All we do at the moment is look for prefixes and suffixes. If both
+    // are empty, then we report nothing. We should be able to do better than
+    // this, but we'll need something more expressive than just a "set of
+    // literals."
+    if let Some(lcp) = lits2.longest_common_prefix() {
+        lits.cross_forward(&mut Seq::new([lcp]));
+    }
+    lits.make_inexact();
+    if let Some(lcs) = lits2.longest_common_suffix() {
+        lits.push(Literal::exact([]));
+        lits.push(Literal::exact(lcs));
+    }
+    // NOTE(review): presumably the pre-migration form of the code above,
+    // kept for reference.
+    /*
+    let lcp = lits2.longest_common_prefix();
+    let lcs = lits2.longest_common_suffix();
+    if !lcp.is_empty() {
+        lits.cross_forward(lcp);
+    }
+    lits.make_inexact();
+    if !lcs.is_empty() {
+        lits.push(Literal::exact([]));
+        lits.push(Literal::exact(lcs));
+    }
+    */
+}
+
+/// Returns false for look-around assertions, captures and repetitions, and
+/// true for every other kind of HIR expression.
+///
+/// `union_required` marks its literals inexact before crossing in a
+/// concatenation member for which this returns false.
+fn is_simple(expr: &Hir) -> bool {
+    match expr.kind() {
+        HirKind::Look(_) | HirKind::Capture(_) | HirKind::Repetition(_) => {
+            false
+        }
+        HirKind::Empty
+        | HirKind::Literal(_)
+        | HirKind::Class(_)
+        | HirKind::Concat(_)
+        | HirKind::Alternation(_) => true,
+    }
+}
+
+/*
+/// Return the number of characters in the given class.
+fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
+/// Return the number of bytes in the given class.
+fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+*/
+
+/// Returns true if and only if at least one literal in the given set is
+/// made up entirely of whitespace.
+///
+/// An empty literal also counts, since `all` is vacuously true for it.
+fn has_only_whitespace(lits: &[Literal]) -> bool {
+    lits.iter()
+        .any(|lit| lit.as_bytes().chars().all(char::is_whitespace))
+}
+
+/// Extracts the prefix literals of `hir` with regex-syntax's literal
+/// extractor and then applies `optimize_for_prefix_by_preference` to them.
+///
+/// The sequence is logged at debug level before and after optimization.
+fn prefixes(hir: &Hir) -> Seq {
+    let mut seq = literal::Extractor::new()
+        .kind(literal::ExtractKind::Prefix)
+        .extract(hir);
+    log::debug!(
+        "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
+        seq.len(),
+        seq.is_exact(),
+        seq
+    );
+    seq.optimize_for_prefix_by_preference();
+    log::debug!(
+        "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
+        seq.len(),
+        seq.is_exact(),
+        seq
+    );
+    seq
+}
+
+/// Extracts the suffix literals of `hir` with regex-syntax's literal
+/// extractor and then applies `optimize_for_suffix_by_preference` to them.
+///
+/// The sequence is logged at debug level before and after optimization.
+fn suffixes(hir: &Hir) -> Seq {
+    let mut seq = literal::Extractor::new()
+        .kind(literal::ExtractKind::Suffix)
+        .extract(hir);
+    log::debug!(
+        "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
+        seq.len(),
+        seq.is_exact(),
+        seq
+    );
+    seq.optimize_for_suffix_by_preference();
+    log::debug!(
+        "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
+        seq.len(),
+        seq.is_exact(),
+        seq
+    );
+    seq
+}
+
+#[cfg(test)]
+mod tests {
+    use super::LiteralSets;
+    use regex_syntax::Parser;
+
+    // Parse the pattern with a default parser and extract its literal sets.
+    // Panics if the pattern fails to parse.
+    fn sets(pattern: &str) -> LiteralSets {
+        let hir = Parser::new().parse(pattern).unwrap();
+        LiteralSets::new(&hir)
+    }
+
+    // Run `one_regex` on the pattern in non-word mode.
+    fn one_regex(pattern: &str) -> Option<String> {
+        sets(pattern).one_regex(false)
+    }
+
+    // Put a pattern into the same format as the one returned by `one_regex`.
+    fn pat(pattern: &str) -> Option<String> {
+        Some(format!("(?-u:{})", pattern))
+    }
+
+    #[test]
+    fn various() {
+        // Obviously no literals.
+        assert!(one_regex(r"\w").is_none());
+        assert!(one_regex(r"\pL").is_none());
+
+        // Tantalizingly close.
+        assert!(one_regex(r"\w|foo").is_none());
+
+        // There's a literal, but it's better if the regex engine handles it
+        // internally.
+        assert!(one_regex(r"abc").is_none());
+
+        // Core use cases.
+        assert_eq!(one_regex(r"\wabc\w"), pat("abc"));
+        assert_eq!(one_regex(r"abc\w"), pat("abc"));
+
+        // TODO: Make these pass. We're missing some potentially big wins
+        // without these.
+        // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
+        // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
+    }
+
+    #[test]
+    fn regression_1064() {
+        // Regression from:
+        // https://github.com/BurntSushi/ripgrep/issues/1064
+        //
+        // The ungrouped form below is disabled; only the grouped form is
+        // checked.
+        // assert_eq!(one_regex(r"a.*c"), pat("a"));
+        assert_eq!(one_regex(r"a(.*c)"), pat("a"));
+    }
+
+    #[test]
+    fn regression_1319() {
+        // Regression from:
+        // https://github.com/BurntSushi/ripgrep/issues/1319
+        assert_eq!(
+            one_regex(r"TTGAGTCCAGGAG[ATCG]{2}C"),
+            pat("TTGAGTCCAGGAG"),
+        );
+    }
+
+    #[test]
+    fn regression_1537() {
+        // Regression from:
+        // https://github.com/BurntSushi/ripgrep/issues/1537
+        //
+        // Each variant wraps `.*,` in a group, nested group or repetition
+        // after the leading `;`.
+        assert_eq!(one_regex(r";(.*,)"), pat(";"));
+        assert_eq!(one_regex(r";((.*,))"), pat(";"));
+        assert_eq!(one_regex(r";(.*,)+"), pat(";"),);
+        assert_eq!(one_regex(r";(.*,){1}"), pat(";"),);
+    }
+}
--- a/crates/regex/src/matcher.rs
+++ b/crates/regex/src/matcher.rs
@@ -1036,7 +1036,9 @@ mod tests {
    }

    // Test that finding candidate lines works as expected.
+    // FIXME: Re-enable this test once inner literal extraction works.
    #[test]
+    #[ignore]
    fn candidate_lines() {
        fn is_confirmed(m: LineMatchKind) -> bool {
            match m {
--- a/crates/regex/src/multi.rs
+++ b/crates/regex/src/multi.rs
@@ -1,6 +1,6 @@
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+use aho_corasick::{AhoCorasick, MatchKind};
 use grep_matcher::{Match, Matcher, NoError};
-use regex_syntax::hir::Hir;
+use regex_syntax::hir::{Hir, HirKind};

 use crate::error::Error;
 use crate::matcher::RegexCaptures;
@@ -23,10 +23,9 @@ impl MultiLiteralMatcher {
    pub fn new<B: AsRef<[u8]>>(
        literals: &[B],
    ) -> Result<MultiLiteralMatcher, Error> {
-        let ac = AhoCorasickBuilder::new()
+        let ac = AhoCorasick::builder()
            .match_kind(MatchKind::LeftmostFirst)
-            .auto_configure(literals)
-            .build_with_size::<usize, _, _>(literals)
+            .build(literals)
            .map_err(Error::regex)?;
        Ok(MultiLiteralMatcher { ac })
    }
@@ -79,13 +78,11 @@ impl Matcher for MultiLiteralMatcher {
 /// Alternation literals checks if the given HIR is a simple alternation of
 /// literals, and if so, returns them. Otherwise, this returns None.
 pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
-    use regex_syntax::hir::{HirKind, Literal};
-
    // This is pretty hacky, but basically, if `is_alternation_literal` is
    // true, then we can make several assumptions about the structure of our
    // HIR. This is what justifies the `unreachable!` statements below.

-    if !expr.is_alternation_literal() {
+    if !expr.properties().is_alternation_literal() {
        return None;
    }
    let alts = match *expr.kind() {
@@ -93,26 +90,16 @@ pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
        _ => return None, // one literal isn't worth it
    };

-    let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
-        Literal::Unicode(c) => {
-            let mut buf = [0; 4];
-            dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
-        }
-        Literal::Byte(b) => {
-            dst.push(b);
-        }
-    };
-
    let mut lits = vec![];
    for alt in alts {
        let mut lit = vec![];
        match *alt.kind() {
            HirKind::Empty => {}
-            HirKind::Literal(ref x) => extendlit(x, &mut lit),
+            HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
            HirKind::Concat(ref exprs) => {
                for e in exprs {
                    match *e.kind() {
-                        HirKind::Literal(ref x) => extendlit(x, &mut lit),
+                        HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
                        _ => unreachable!("expected literal, got {:?}", e),
                    }
                }
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -1,6 +1,10 @@
-use grep_matcher::ByteSet;
-use regex_syntax::hir::{self, Hir, HirKind};
-use regex_syntax::utf8::Utf8Sequences;
+use {
+    grep_matcher::ByteSet,
+    regex_syntax::{
+        hir::{self, Hir, HirKind, Look},
+        utf8::Utf8Sequences,
+    },
+};

 /// Return a confirmed set of non-matching bytes from the given expression.
 pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
@@ -13,18 +17,28 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
 /// the given expression.
 fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
    match *expr.kind() {
-        HirKind::Empty | HirKind::WordBoundary(_) => {}
-        HirKind::Anchor(_) => {
+        HirKind::Empty
+        // | HirKind::Look(Look::Start | Look::End)
+        | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
+        | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
+        HirKind::Look(Look::Start | Look::End) => {
+            // FIXME: This is wrong, but not doing this leads to incorrect
+            // results because of how anchored searches are implemented in
+            // the 'grep-searcher' crate.
            set.remove(b'\n');
        }
-        HirKind::Literal(hir::Literal::Unicode(c)) => {
-            for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
+        HirKind::Look(Look::StartLF | Look::EndLF) => {
+            set.remove(b'\n');
+        }
+        HirKind::Look(Look::StartCRLF | Look::EndCRLF) => {
+            set.remove(b'\r');
+            set.remove(b'\n');
+        }
+        HirKind::Literal(hir::Literal(ref lit)) => {
+            for &b in lit.iter() {
                set.remove(b);
            }
        }
-        HirKind::Literal(hir::Literal::Byte(b)) => {
-            set.remove(b);
-        }
        HirKind::Class(hir::Class::Unicode(ref cls)) => {
            for range in cls.iter() {
                // This is presumably faster than encoding every codepoint
@@ -42,10 +56,10 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
            }
        }
        HirKind::Repetition(ref x) => {
-            remove_matching_bytes(&x.hir, set);
+            remove_matching_bytes(&x.sub, set);
        }
-        HirKind::Group(ref x) => {
-            remove_matching_bytes(&x.hir, set);
+        HirKind::Capture(ref x) => {
+            remove_matching_bytes(&x.sub, set);
        }
        HirKind::Concat(ref xs) => {
            for x in xs {
@@ -62,17 +76,13 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {

 #[cfg(test)]
 mod tests {
-    use grep_matcher::ByteSet;
-    use regex_syntax::ParserBuilder;
+    use {grep_matcher::ByteSet, regex_syntax::ParserBuilder};

    use super::non_matching_bytes;

    fn extract(pattern: &str) -> ByteSet {
-        let expr = ParserBuilder::new()
-            .allow_invalid_utf8(true)
-            .build()
-            .parse(pattern)
-            .unwrap();
+        let expr =
+            ParserBuilder::new().utf8(false).build().parse(pattern).unwrap();
        non_matching_bytes(&expr)
    }

@@ -131,9 +141,13 @@ mod tests {

    #[test]
    fn anchor() {
+        // FIXME: The first four tests below should correspond to a full set
+        // of bytes for the non-matching bytes I think.
        assert_eq!(sparse(&extract(r"^")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"$")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"\A")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"\z")), sparse_except(&[b'\n']));
+        assert_eq!(sparse(&extract(r"(?m)^")), sparse_except(&[b'\n']));
+        assert_eq!(sparse(&extract(r"(?m)$")), sparse_except(&[b'\n']));
    }
 }
--- a/crates/regex/src/strip.rs
+++ b/crates/regex/src/strip.rs
@@ -42,17 +42,11 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {

    Ok(match expr.into_kind() {
        HirKind::Empty => Hir::empty(),
-        HirKind::Literal(hir::Literal::Unicode(c)) => {
-            if c == chr {
+        HirKind::Literal(hir::Literal(lit)) => {
+            if lit.iter().find(|&&b| b == byte).is_some() {
                return invalid();
            }
-            Hir::literal(hir::Literal::Unicode(c))
-        }
-        HirKind::Literal(hir::Literal::Byte(b)) => {
-            if b as char == chr {
-                return invalid();
-            }
-            Hir::literal(hir::Literal::Byte(b))
+            Hir::literal(lit)
        }
        HirKind::Class(hir::Class::Unicode(mut cls)) => {
            let remove = hir::ClassUnicode::new(Some(
@@ -74,15 +68,14 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
            }
            Hir::class(hir::Class::Bytes(cls))
        }
-        HirKind::Anchor(x) => Hir::anchor(x),
-        HirKind::WordBoundary(x) => Hir::word_boundary(x),
+        HirKind::Look(x) => Hir::look(x),
        HirKind::Repetition(mut x) => {
-            x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
+            x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?);
            Hir::repetition(x)
        }
-        HirKind::Group(mut x) => {
-            x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
-            Hir::group(x)
+        HirKind::Capture(mut x) => {
+            x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?);
+            Hir::capture(x)
        }
        HirKind::Concat(xs) => {
            let xs = xs
@@ -131,11 +124,11 @@ mod tests {

    #[test]
    fn various() {
-        assert_eq!(roundtrip(r"[a\n]", b'\n'), "[a]");
-        assert_eq!(roundtrip(r"[a\n]", b'a'), "[\n]");
-        assert_eq!(roundtrip_crlf(r"[a\n]"), "[a]");
-        assert_eq!(roundtrip_crlf(r"[a\r]"), "[a]");
-        assert_eq!(roundtrip_crlf(r"[a\r\n]"), "[a]");
+        assert_eq!(roundtrip(r"[a\n]", b'\n'), "a");
+        assert_eq!(roundtrip(r"[a\n]", b'a'), "\n");
+        assert_eq!(roundtrip_crlf(r"[a\n]"), "a");
+        assert_eq!(roundtrip_crlf(r"[a\r]"), "a");
+        assert_eq!(roundtrip_crlf(r"[a\r\n]"), "a");

        assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])");
        assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])");
--- a/crates/regex/src/util.rs
+++ b/crates/regex/src/util.rs
@@ -1,5 +1,6 @@
 /// Converts an arbitrary sequence of bytes to a literal suitable for building
 /// a regular expression.
+#[allow(dead_code)]
 pub fn bytes_to_regex(bs: &[u8]) -> String {
    use regex_syntax::is_meta_character;
    use std::fmt::Write;