cli: use bstr

This uses bstr in the unescaping logic. This lets us remove some platform specific code, and also lets us remove a hacked UTF-8 decoder on raw bytes.
2025-08-04 21:52:54 +02:00 · 2019-04-04 15:15:48 -04:00
parent 9b8f5cbaba
commit d968a27ed5
3 changed files with 13 additions and 63 deletions
--- a/grep-cli/Cargo.toml
+++ b/grep-cli/Cargo.toml
@ -14,6 +14,7 @@ license = "Unlicense/MIT"

 [dependencies]
 atty = "0.2.11"
+bstr = "0.1.2"
 globset = { version = "0.4.2", path = "../globset" }
 lazy_static = "1.1.0"
 log = "0.4.5"
--- a/grep-cli/src/escape.rs
+++ b/grep-cli/src/escape.rs
@ -1,6 +1,8 @@
 use std::ffi::OsStr;
 use std::str;

+use bstr::{BStr, BString};
+
 /// A single state in the state machine used by `unescape`.
 #[derive(Clone, Copy, Eq, PartialEq)]
 enum State {
@ -35,18 +37,16 @@ enum State {
 ///
 /// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz"));
 /// ```
-pub fn escape(mut bytes: &[u8]) -> String {
+pub fn escape(bytes: &[u8]) -> String {
+    let bytes = BStr::new(bytes);
    let mut escaped = String::new();
-    while let Some(result) = decode_utf8(bytes) {
-        match result {
-            Ok(cp) => {
-                escape_char(cp, &mut escaped);
-                bytes = &bytes[cp.len_utf8()..];
-            }
-            Err(byte) => {
-                escape_byte(byte, &mut escaped);
-                bytes = &bytes[1..];
+    for (s, e, ch) in bytes.char_indices() {
+        if ch == '\u{FFFD}' {
+            for b in bytes[s..e].bytes() {
+                escape_byte(b, &mut escaped);
            }
+        } else {
+            escape_char(ch, &mut escaped);
        }
    }
    escaped
@ -56,19 +56,7 @@ pub fn escape(mut bytes: &[u8]) -> String {
 ///
 /// This is like [`escape`](fn.escape.html), but accepts an OS string.
 pub fn escape_os(string: &OsStr) -> String {
-    #[cfg(unix)]
-    fn imp(string: &OsStr) -> String {
-        use std::os::unix::ffi::OsStrExt;
-
-        escape(string.as_bytes())
-    }
-
-    #[cfg(not(unix))]
-    fn imp(string: &OsStr) -> String {
-        escape(string.to_string_lossy().as_bytes())
-    }
-
-    imp(string)
+    escape(BString::from_os_str_lossy(string).as_bytes())
 }

 /// Unescapes a string.
@ -195,46 +183,6 @@ fn escape_byte(byte: u8, into: &mut String) {
    }
 }

-/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the beginning of the given
-/// byte slice, then the first byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
-    if bytes.is_empty() {
-        return None;
-    }
-    let len = match utf8_len(bytes[0]) {
-        None => return Some(Err(bytes[0])),
-        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
-        Some(len) =>  len,
-    };
-    match str::from_utf8(&bytes[..len]) {
-        Ok(s) => Some(Ok(s.chars().next().unwrap())),
-        Err(_) => Some(Err(bytes[0])),
-    }
-}
-
-/// Given a UTF-8 leading byte, this returns the total number of code units
-/// in the following encoded codepoint.
-///
-/// If the given byte is not a valid UTF-8 leading byte, then this returns
-/// `None`.
-fn utf8_len(byte: u8) -> Option<usize> {
-    if byte <= 0x7F {
-        Some(1)
-    } else if byte <= 0b110_11111 {
-        Some(2)
-    } else if byte <= 0b1110_1111 {
-        Some(3)
-    } else if byte <= 0b1111_0111 {
-        Some(4)
-    } else {
-        None
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::{escape, unescape};
--- a/grep-cli/src/lib.rs
+++ b/grep-cli/src/lib.rs
@ -159,6 +159,7 @@ error message is crafted that typically tells the user how to fix the problem.
 #![deny(missing_docs)]

 extern crate atty;
+extern crate bstr;
 extern crate globset;
 #[macro_use]
 extern crate lazy_static;