From d968a27ed5298d99e46ff65b68a7f6c2c641105f Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Thu, 4 Apr 2019 15:15:48 -0400
Subject: [PATCH] cli: use bstr

This uses bstr in the escaping logic. This lets us remove some
platform-specific code, and also lets us remove a hacked UTF-8 decoder on
raw bytes.
---
 grep-cli/Cargo.toml    |  1 +
 grep-cli/src/escape.rs | 74 +++++++-----------------------------------
 grep-cli/src/lib.rs    |  1 +
 3 files changed, 13 insertions(+), 63 deletions(-)
diff --git a/grep-cli/Cargo.toml b/grep-cli/Cargo.toml
index 29e15b28..f143e401 100644
--- a/grep-cli/Cargo.toml
+++ b/grep-cli/Cargo.toml
@@ -14,6 +14,7 @@ license = "Unlicense/MIT"
 
 [dependencies]
 atty = "0.2.11"
+bstr = "0.1.2"
 globset = { version = "0.4.2", path = "../globset" }
 lazy_static = "1.1.0"
 log = "0.4.5"
diff --git a/grep-cli/src/escape.rs b/grep-cli/src/escape.rs
index 9b350a93..7ea96788 100644
--- a/grep-cli/src/escape.rs
+++ b/grep-cli/src/escape.rs
@@ -1,6 +1,8 @@
 use std::ffi::OsStr;
 use std::str;
 
+use bstr::{BStr, BString};
+
 /// A single state in the state machine used by `unescape`.
 #[derive(Clone, Copy, Eq, PartialEq)]
 enum State {
@@ -35,18 +37,16 @@ enum State {
 ///
 /// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz"));
 /// ```
-pub fn escape(mut bytes: &[u8]) -> String {
+pub fn escape(bytes: &[u8]) -> String {
+    let bytes = BStr::new(bytes);
     let mut escaped = String::new();
-    while let Some(result) = decode_utf8(bytes) {
-        match result {
-            Ok(cp) => {
-                escape_char(cp, &mut escaped);
-                bytes = &bytes[cp.len_utf8()..];
-            }
-            Err(byte) => {
-                escape_byte(byte, &mut escaped);
-                bytes = &bytes[1..];
+    for (s, e, ch) in bytes.char_indices() {
+        if ch == '\u{FFFD}' {
+            for b in bytes[s..e].bytes() {
+                escape_byte(b, &mut escaped);
             }
+        } else {
+            escape_char(ch, &mut escaped);
         }
     }
     escaped
@@ -56,19 +56,7 @@ pub fn escape(mut bytes: &[u8]) -> String {
 ///
 /// This is like [`escape`](fn.escape.html), but accepts an OS string.
 pub fn escape_os(string: &OsStr) -> String {
-    #[cfg(unix)]
-    fn imp(string: &OsStr) -> String {
-        use std::os::unix::ffi::OsStrExt;
-
-        escape(string.as_bytes())
-    }
-
-    #[cfg(not(unix))]
-    fn imp(string: &OsStr) -> String {
-        escape(string.to_string_lossy().as_bytes())
-    }
-
-    imp(string)
+    escape(BString::from_os_str_lossy(string).as_bytes())
 }
 
 /// Unescapes a string.
@@ -195,46 +183,6 @@ fn escape_byte(byte: u8, into: &mut String) {
     }
 }
 
-/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the beginning of the given
-/// byte slice, then the first byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
-    if bytes.is_empty() {
-        return None;
-    }
-    let len = match utf8_len(bytes[0]) {
-        None => return Some(Err(bytes[0])),
-        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
-        Some(len) =>  len,
-    };
-    match str::from_utf8(&bytes[..len]) {
-        Ok(s) => Some(Ok(s.chars().next().unwrap())),
-        Err(_) => Some(Err(bytes[0])),
-    }
-}
-
-/// Given a UTF-8 leading byte, this returns the total number of code units
-/// in the following encoded codepoint.
-///
-/// If the given byte is not a valid UTF-8 leading byte, then this returns
-/// `None`.
-fn utf8_len(byte: u8) -> Option<usize> {
-    if byte <= 0x7F {
-        Some(1)
-    } else if byte <= 0b110_11111 {
-        Some(2)
-    } else if byte <= 0b1110_1111 {
-        Some(3)
-    } else if byte <= 0b1111_0111 {
-        Some(4)
-    } else {
-        None
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::{escape, unescape};
diff --git a/grep-cli/src/lib.rs b/grep-cli/src/lib.rs
index b9909c20..9c5d71ad 100644
--- a/grep-cli/src/lib.rs
+++ b/grep-cli/src/lib.rs
@@ -159,6 +159,7 @@ error message is crafted that typically tells the user how to fix the problem.
 #![deny(missing_docs)]
 
 extern crate atty;
+extern crate bstr;
 extern crate globset;
 #[macro_use]
 extern crate lazy_static;