libripgrep: initial commit introducing libripgrep

libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162
2025-06-25 14:22:54 +02:00 · 2018-04-29 09:29:52 -04:00
parent 0958837ee1
commit d9ca529356
68 changed files with 18010 additions and 20 deletions
--- a/grep-regex/src/non_matching.rs
+++ b/grep-regex/src/non_matching.rs
@ -0,0 +1,128 @@
+use grep_matcher::ByteSet;
+use regex_syntax::hir::{self, Hir, HirKind};
+use utf8_ranges::Utf8Sequences;
+
+/// Compute the set of bytes that cannot occur in any match of `expr`.
+pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
+    let mut candidates = ByteSet::full();
+    remove_matching_bytes(expr, &mut candidates);
+    candidates
+}
+
+/// Strip from `set` every byte that may appear in a match produced by the
+/// given expression, recursing through all of its sub-expressions.
+fn remove_matching_bytes(
+    expr: &Hir,
+    set: &mut ByteSet,
+) {
+    match *expr.kind() {
+        // Nothing is removed for these kinds; presumably they match without
+        // consuming any bytes.
+        HirKind::Empty | HirKind::Anchor(_) | HirKind::WordBoundary(_) => {}
+        HirKind::Literal(hir::Literal::Byte(byte)) => {
+            set.remove(byte);
+        }
+        HirKind::Literal(hir::Literal::Unicode(ch)) => {
+            // A Unicode literal is handled via its UTF-8 encoding: each
+            // encoded byte is removed from the set.
+            let mut utf8 = [0; 4];
+            for &byte in ch.encode_utf8(&mut utf8).as_bytes() {
+                set.remove(byte);
+            }
+        }
+        HirKind::Class(hir::Class::Bytes(ref class)) => {
+            for rng in class.iter() {
+                set.remove_all(rng.start(), rng.end());
+            }
+        }
+        HirKind::Class(hir::Class::Unicode(ref class)) => {
+            for rng in class.iter() {
+                // Remove the byte ranges of each UTF-8 sequence covering
+                // this range. Presumably faster than encoding every
+                // codepoint and removing its bytes one at a time.
+                let seqs = Utf8Sequences::new(rng.start(), rng.end());
+                for seq in seqs {
+                    for byte_rng in seq.as_slice() {
+                        set.remove_all(byte_rng.start, byte_rng.end);
+                    }
+                }
+            }
+        }
+        // Repetitions and groups wrap a single sub-expression, so just
+        // recurse into it.
+        HirKind::Repetition(ref rep) => remove_matching_bytes(&rep.hir, set),
+        HirKind::Group(ref group) => remove_matching_bytes(&group.hir, set),
+        // Concatenations and alternations both carry a list of
+        // sub-expressions, each of which is visited.
+        HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
+            for sub in subs {
+                remove_matching_bytes(sub, set);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use grep_matcher::ByteSet;
+    use regex_syntax::ParserBuilder;
+
+    use super::non_matching_bytes;
+
+    /// Parse `pattern` (with invalid UTF-8 permitted) and compute the bytes
+    /// that cannot occur in any of its matches.
+    fn extract(pattern: &str) -> ByteSet {
+        let mut parser = ParserBuilder::new()
+            .allow_invalid_utf8(true)
+            .build();
+        let hir = parser.parse(pattern).unwrap();
+        non_matching_bytes(&hir)
+    }
+
+    /// List the members of `set` in ascending byte order.
+    fn sparse(set: &ByteSet) -> Vec<u8> {
+        (0..256)
+            .map(|b| b as u8)
+            .filter(|&b| set.contains(b))
+            .collect()
+    }
+
+    /// List, in ascending order, every byte value not present in `except`.
+    fn sparse_except(except: &[u8]) -> Vec<u8> {
+        (0..256)
+            .map(|b| b as u8)
+            .filter(|b| !except.contains(b))
+            .collect()
+    }
+
+    #[test]
+    fn dot() {
+        // Byte values that never appear in valid UTF-8.
+        let never_utf8: Vec<u8> = vec![
+            192, 193, 245, 246, 247, 248, 249,
+            250, 251, 252, 253, 254, 255,
+        ];
+        let mut never_utf8_or_newline = vec![b'\n'];
+        never_utf8_or_newline.extend(never_utf8.iter().cloned());
+
+        assert_eq!(sparse(&extract(".")), never_utf8_or_newline);
+        assert_eq!(sparse(&extract("(?s).")), never_utf8);
+        assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
+        assert_eq!(sparse(&extract("(?s-u).")), vec![]);
+    }
+
+    #[test]
+    fn literal() {
+        // Assert that the non-matching set for `pattern` is every byte
+        // except those listed in `matching`.
+        let check = |pattern: &str, matching: &[u8]| {
+            assert_eq!(sparse(&extract(pattern)), sparse_except(matching));
+        };
+        check("a", &[b'a']);
+        check("☃", &[0xE2, 0x98, 0x83]);
+        // With Unicode enabled, `\xFF` is expected to yield the two UTF-8
+        // bytes C3 BF; with `(?-u)` it is the single raw byte FF.
+        check(r"\xFF", &[0xC3, 0xBF]);
+        check(r"(?-u)\xFF", &[0xFF]);
+    }
+}