From d968a27ed5298d99e46ff65b68a7f6c2c641105f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 Apr 2019 15:15:48 -0400 Subject: [PATCH] cli: use bstr This uses bstr in the escaping logic. This lets us remove some platform-specific code, and also lets us remove a hacked UTF-8 decoder on raw bytes. --- grep-cli/Cargo.toml | 1 + grep-cli/src/escape.rs | 74 +++++++----------------------------------- grep-cli/src/lib.rs | 1 + 3 files changed, 13 insertions(+), 63 deletions(-) diff --git a/grep-cli/Cargo.toml b/grep-cli/Cargo.toml index 29e15b28..f143e401 100644 --- a/grep-cli/Cargo.toml +++ b/grep-cli/Cargo.toml @@ -14,6 +14,7 @@ license = "Unlicense/MIT" [dependencies] atty = "0.2.11" +bstr = "0.1.2" globset = { version = "0.4.2", path = "../globset" } lazy_static = "1.1.0" log = "0.4.5" diff --git a/grep-cli/src/escape.rs b/grep-cli/src/escape.rs index 9b350a93..7ea96788 100644 --- a/grep-cli/src/escape.rs +++ b/grep-cli/src/escape.rs @@ -1,6 +1,8 @@ use std::ffi::OsStr; use std::str; +use bstr::{BStr, BString}; + /// A single state in the state machine used by `unescape`. #[derive(Clone, Copy, Eq, PartialEq)] enum State { @@ -35,18 +37,16 @@ enum State { /// /// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz")); /// ``` -pub fn escape(mut bytes: &[u8]) -> String { +pub fn escape(bytes: &[u8]) -> String { + let bytes = BStr::new(bytes); let mut escaped = String::new(); - while let Some(result) = decode_utf8(bytes) { - match result { - Ok(cp) => { - escape_char(cp, &mut escaped); - bytes = &bytes[cp.len_utf8()..]; - } - Err(byte) => { - escape_byte(byte, &mut escaped); - bytes = &bytes[1..]; + for (s, e, ch) in bytes.char_indices() { + if ch == '\u{FFFD}' { + for b in bytes[s..e].bytes() { + escape_byte(b, &mut escaped); } + } else { + escape_char(ch, &mut escaped); } } escaped @@ -56,19 +56,7 @@ pub fn escape(mut bytes: &[u8]) -> String { /// /// This is like [`escape`](fn.escape.html), but accepts an OS string.
pub fn escape_os(string: &OsStr) -> String { - #[cfg(unix)] - fn imp(string: &OsStr) -> String { - use std::os::unix::ffi::OsStrExt; - - escape(string.as_bytes()) - } - - #[cfg(not(unix))] - fn imp(string: &OsStr) -> String { - escape(string.to_string_lossy().as_bytes()) - } - - imp(string) + escape(BString::from_os_str_lossy(string).as_bytes()) } /// Unescapes a string. @@ -195,46 +183,6 @@ fn escape_byte(byte: u8, into: &mut String) { } } -/// Decodes the next UTF-8 encoded codepoint from the given byte slice. -/// -/// If no valid encoding of a codepoint exists at the beginning of the given -/// byte slice, then the first byte is returned instead. -/// -/// This returns `None` if and only if `bytes` is empty. -fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { - if bytes.is_empty() { - return None; - } - let len = match utf8_len(bytes[0]) { - None => return Some(Err(bytes[0])), - Some(len) if len > bytes.len() => return Some(Err(bytes[0])), - Some(len) => len, - }; - match str::from_utf8(&bytes[..len]) { - Ok(s) => Some(Ok(s.chars().next().unwrap())), - Err(_) => Some(Err(bytes[0])), - } -} - -/// Given a UTF-8 leading byte, this returns the total number of code units -/// in the following encoded codepoint. -/// -/// If the given byte is not a valid UTF-8 leading byte, then this returns -/// `None`. -fn utf8_len(byte: u8) -> Option<usize> { - if byte <= 0x7F { - Some(1) - } else if byte <= 0b110_11111 { - Some(2) - } else if byte <= 0b1110_1111 { - Some(3) - } else if byte <= 0b1111_0111 { - Some(4) - } else { - None - } -} - #[cfg(test)] mod tests { use super::{escape, unescape}; diff --git a/grep-cli/src/lib.rs b/grep-cli/src/lib.rs index b9909c20..9c5d71ad 100644 --- a/grep-cli/src/lib.rs +++ b/grep-cli/src/lib.rs @@ -159,6 +159,7 @@ error message is crafted that typically tells the user how to fix the problem. #![deny(missing_docs)] extern crate atty; +extern crate bstr; extern crate globset; #[macro_use] extern crate lazy_static;