From 02de97b8ce2762a7530cc18bba737f6ccea022a2 Mon Sep 17 00:00:00 2001 From: Andre Bogus Date: Fri, 23 Sep 2016 04:59:25 +0200 Subject: [PATCH] Use the bytecount crate for fast line counting. Fixes #128 --- Cargo.lock | 7 ++++ Cargo.toml | 1 + src/search_stream.rs | 84 ++------------------------------------------ tests/tests.rs | 9 +++++ tests/workdir.rs | 7 +++- 5 files changed, 26 insertions(+), 82 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d6f512f7..d497e7b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,7 @@ name = "ripgrep" version = "0.2.6" dependencies = [ + "bytecount 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "ctrlc 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -28,6 +29,11 @@ dependencies = [ "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "bytecount" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "crossbeam" version = "0.2.10" @@ -282,6 +288,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" +"checksum bytecount 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "49e3c21915578e2300b08d3c174a8ac887e0c6421dff86fdc4d741dc29e5d413" "checksum crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "0c5ea215664ca264da8a9d9c3be80d2eaf30923c259d03e870388eb927508f97" "checksum ctrlc 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "77f98bb69e3fefadcc5ca80a1368a55251f70295168203e01165bcaecb270891" "checksum docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)" = "4a7ef30445607f6fc8720f0a0a2c7442284b629cf0d049286860fae23e71c4d9" diff --git a/Cargo.toml b/Cargo.toml index 95cc97d8..23135f30 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ name = "integration" path = "tests/tests.rs" [dependencies] +bytecount = "0.1.4" ctrlc = "2.0" docopt = "0.6" env_logger = "0.3" diff --git a/src/search_stream.rs b/src/search_stream.rs index 8f458ca5..cbd7a63e 100644 --- a/src/search_stream.rs +++ b/src/search_stream.rs @@ -4,6 +4,8 @@ printing matches. In particular, it searches the file in a streaming fashion using `read` calls and a (roughly) fixed size buffer. */ +extern crate bytecount; + use std::cmp; use std::error::Error as StdError; use std::fmt; @@ -582,89 +584,9 @@ pub fn is_binary(buf: &[u8]) -> bool { } /// Count the number of lines in the given buffer. -#[inline(never)] - #[inline(never)] pub fn count_lines(buf: &[u8], eol: u8) -> u64 { - // This was adapted from code in the memchr crate. The specific benefit - // here is that we can avoid a branch in the inner loop because all we're - // doing is counting. - - // The technique to count EOL bytes was adapted from: - // http://bits.stephan-brumme.com/null.html - const LO_U64: u64 = 0x0101010101010101; - const HI_U64: u64 = 0x8080808080808080; - - // use truncation - const LO_USIZE: usize = LO_U64 as usize; - const HI_USIZE: usize = HI_U64 as usize; - - #[cfg(target_pointer_width = "32")] - const USIZE_BYTES: usize = 4; - #[cfg(target_pointer_width = "64")] - const USIZE_BYTES: usize = 8; - - fn count_eol(eol: usize) -> u64 { - // Ideally, this would compile down to a POPCNT instruction, but - // it looks like you need to set RUSTFLAGS="-C target-cpu=native" - // (or target-feature=+popcnt) to get that to work. Bummer. - (eol.wrapping_sub(LO_USIZE) & !eol & HI_USIZE).count_ones() as u64 - } - - #[cfg(target_pointer_width = "32")] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep - } - - #[cfg(target_pointer_width = "64")] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep = rep << 32 | rep; - rep - } - - fn count_lines_slow(mut buf: &[u8], eol: u8) -> u64 { - let mut count = 0; - while let Some(pos) = memchr(eol, buf) { - count += 1; - buf = &buf[pos + 1..]; - } - count - } - - let len = buf.len(); - let ptr = buf.as_ptr(); - let mut count = 0; - - // Search up to an aligned boundary... - let align = (ptr as usize) & (USIZE_BYTES - 1); - let mut i = 0; - if align > 0 { - i = cmp::min(USIZE_BYTES - align, len); - count += count_lines_slow(&buf[..i], eol); - } - - // ... and search the rest. - let repeated_eol = repeat_byte(eol); - - if len >= 2 * USIZE_BYTES { - while i <= len - (2 * USIZE_BYTES) { - unsafe { - let u = *(ptr.offset(i as isize) as *const usize); - let v = *(ptr.offset((i + USIZE_BYTES) as isize) - as *const usize); - - count += count_eol(u ^ repeated_eol); - count += count_eol(v ^ repeated_eol); - } - i += USIZE_BYTES * 2; - } - } - count += count_lines_slow(&buf[i..], eol); - count + bytecount::count(buf, eol) as u64 } /// Replaces a with b in buf. diff --git a/tests/tests.rs b/tests/tests.rs index 65962c74..48fb8185 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -789,6 +789,15 @@ clean!(regression_127, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| { assert_eq!(lines, expected); }); +// See: https://github.com/BurntSushi/ripgrep/issues/128 +clean!(regression_128, "x", ".", |wd: WorkDir, mut cmd: Command| { + wd.create_bytes("foo", b"01234567\x0b\n\x0b\n\x0b\n\x0b\nx"); + cmd.arg("-n"); + + let lines: String = wd.stdout(&mut cmd); + assert_eq!(lines, "foo:5:x\n"); +}); + // See: https://github.com/BurntSushi/ripgrep/issues/131 // // TODO(burntsushi): Darwin doesn't like this test for some reason. diff --git a/tests/workdir.rs b/tests/workdir.rs index 6a79daf7..5ef3b72e 100644 --- a/tests/workdir.rs +++ b/tests/workdir.rs @@ -43,9 +43,14 @@ impl WorkDir { /// Create a new file with the given name and contents in this directory. pub fn create>(&self, name: P, contents: &str) { + self.create_bytes(name, contents.as_bytes()); + } + + /// Create a new file with the given name and contents in this directory. + pub fn create_bytes>(&self, name: P, contents: &[u8]) { let path = self.dir.join(name); let mut file = nice_err(&path, File::create(&path)); - nice_err(&path, file.write_all(contents.as_bytes())); + nice_err(&path, file.write_all(contents)); nice_err(&path, file.flush()); }