From 02de97b8ce2762a7530cc18bba737f6ccea022a2 Mon Sep 17 00:00:00 2001
From: Andre Bogus <bogusandre@gmail.com>
Date: Fri, 23 Sep 2016 04:59:25 +0200
Subject: [PATCH] Use the bytecount crate for fast line counting.

Fixes #128
---
 Cargo.lock           |  7 ++++
 Cargo.toml           |  1 +
 src/search_stream.rs | 84 ++------------------------------------------
 tests/tests.rs       |  9 +++++
 tests/workdir.rs     |  7 +++-
 5 files changed, 26 insertions(+), 82 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d6f512f7..d497e7b7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,7 @@
 name = "ripgrep"
 version = "0.2.6"
 dependencies = [
+ "bytecount 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "ctrlc 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)",
  "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -28,6 +29,11 @@ dependencies = [
  "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
+[[package]]
+name = "bytecount"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "crossbeam"
 version = "0.2.10"
@@ -282,6 +288,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [metadata]
 "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
+"checksum bytecount 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "49e3c21915578e2300b08d3c174a8ac887e0c6421dff86fdc4d741dc29e5d413"
 "checksum crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "0c5ea215664ca264da8a9d9c3be80d2eaf30923c259d03e870388eb927508f97"
 "checksum ctrlc 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "77f98bb69e3fefadcc5ca80a1368a55251f70295168203e01165bcaecb270891"
 "checksum docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)" = "4a7ef30445607f6fc8720f0a0a2c7442284b629cf0d049286860fae23e71c4d9"
diff --git a/Cargo.toml b/Cargo.toml
index 95cc97d8..23135f30 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ name = "integration"
 path = "tests/tests.rs"
 
 [dependencies]
+bytecount = "0.1.4"
 ctrlc = "2.0"
 docopt = "0.6"
 env_logger = "0.3"
diff --git a/src/search_stream.rs b/src/search_stream.rs
index 8f458ca5..cbd7a63e 100644
--- a/src/search_stream.rs
+++ b/src/search_stream.rs
@@ -4,6 +4,8 @@ printing matches. In particular, it searches the file in a streaming fashion
 using `read` calls and a (roughly) fixed size buffer.
 */
 
+extern crate bytecount;
+
 use std::cmp;
 use std::error::Error as StdError;
 use std::fmt;
@@ -582,89 +584,9 @@ pub fn is_binary(buf: &[u8]) -> bool {
 }
 
 /// Count the number of lines in the given buffer.
-#[inline(never)]
-
 #[inline(never)]
 pub fn count_lines(buf: &[u8], eol: u8) -> u64 {
-    // This was adapted from code in the memchr crate. The specific benefit
-    // here is that we can avoid a branch in the inner loop because all we're
-    // doing is counting.
-
-    // The technique to count EOL bytes was adapted from:
-    // http://bits.stephan-brumme.com/null.html
-    const LO_U64: u64 = 0x0101010101010101;
-    const HI_U64: u64 = 0x8080808080808080;
-
-    // use truncation
-    const LO_USIZE: usize = LO_U64 as usize;
-    const HI_USIZE: usize = HI_U64 as usize;
-
-    #[cfg(target_pointer_width = "32")]
-    const USIZE_BYTES: usize = 4;
-    #[cfg(target_pointer_width = "64")]
-    const USIZE_BYTES: usize = 8;
-
-    fn count_eol(eol: usize) -> u64 {
-        // Ideally, this would compile down to a POPCNT instruction, but
-        // it looks like you need to set RUSTFLAGS="-C target-cpu=native"
-        // (or target-feature=+popcnt) to get that to work. Bummer.
-        (eol.wrapping_sub(LO_USIZE) & !eol & HI_USIZE).count_ones() as u64
-    }
-
-    #[cfg(target_pointer_width = "32")]
-    fn repeat_byte(b: u8) -> usize {
-        let mut rep = (b as usize) << 8 | b as usize;
-        rep = rep << 16 | rep;
-        rep
-    }
-
-    #[cfg(target_pointer_width = "64")]
-    fn repeat_byte(b: u8) -> usize {
-        let mut rep = (b as usize) << 8 | b as usize;
-        rep = rep << 16 | rep;
-        rep = rep << 32 | rep;
-        rep
-    }
-
-    fn count_lines_slow(mut buf: &[u8], eol: u8) -> u64 {
-        let mut count = 0;
-        while let Some(pos) = memchr(eol, buf) {
-            count += 1;
-            buf = &buf[pos + 1..];
-        }
-        count
-    }
-
-    let len = buf.len();
-    let ptr = buf.as_ptr();
-    let mut count = 0;
-
-    // Search up to an aligned boundary...
-    let align = (ptr as usize) & (USIZE_BYTES - 1);
-    let mut i = 0;
-    if align > 0 {
-        i = cmp::min(USIZE_BYTES - align, len);
-        count += count_lines_slow(&buf[..i], eol);
-    }
-
-    // ... and search the rest.
-    let repeated_eol = repeat_byte(eol);
-
-    if len >= 2 * USIZE_BYTES {
-        while i <= len - (2 * USIZE_BYTES) {
-            unsafe {
-                let u = *(ptr.offset(i as isize) as *const usize);
-                let v = *(ptr.offset((i + USIZE_BYTES) as isize)
-                          as *const usize);
-
-                count += count_eol(u ^ repeated_eol);
-                count += count_eol(v ^ repeated_eol);
-            }
-            i += USIZE_BYTES * 2;
-        }
-    }
-    count += count_lines_slow(&buf[i..], eol);
-    count
+    bytecount::count(buf, eol) as u64
 }
 
 /// Replaces a with b in buf.
diff --git a/tests/tests.rs b/tests/tests.rs
index 65962c74..48fb8185 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -789,6 +789,15 @@ clean!(regression_127, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
     assert_eq!(lines, expected);
 });
 
+// See: https://github.com/BurntSushi/ripgrep/issues/128
+clean!(regression_128, "x", ".", |wd: WorkDir, mut cmd: Command| {
+    wd.create_bytes("foo", b"01234567\x0b\n\x0b\n\x0b\n\x0b\nx");
+    cmd.arg("-n");
+
+    let lines: String = wd.stdout(&mut cmd);
+    assert_eq!(lines, "foo:5:x\n");
+});
+
 // See: https://github.com/BurntSushi/ripgrep/issues/131
 //
 // TODO(burntsushi): Darwin doesn't like this test for some reason.
diff --git a/tests/workdir.rs b/tests/workdir.rs
index 6a79daf7..5ef3b72e 100644
--- a/tests/workdir.rs
+++ b/tests/workdir.rs
@@ -43,9 +43,14 @@ impl WorkDir {
 
     /// Create a new file with the given name and contents in this directory.
     pub fn create<P: AsRef<Path>>(&self, name: P, contents: &str) {
+        self.create_bytes(name, contents.as_bytes());
+    }
+
+    /// Create a new file with the given name and contents in this directory.
+    pub fn create_bytes<P: AsRef<Path>>(&self, name: P, contents: &[u8]) {
         let path = self.dir.join(name);
         let mut file = nice_err(&path, File::create(&path));
-        nice_err(&path, file.write_all(contents.as_bytes()));
+        nice_err(&path, file.write_all(contents));
         nice_err(&path, file.flush());
     }