progress

2025-11-23 21:54:45 +02:00 · 2016-06-22 21:19:02 -04:00
parent 0163b39faa
commit a3f609222c
3 changed files with 13290 additions and 46 deletions
--- a/grep/src/data/sherlock.txt
+++ b/grep/src/data/sherlock.txt
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@@ -4,6 +4,7 @@ extern crate regex_syntax as syntax;

 use std::error;
 use std::fmt;
+use std::io;
 use std::result;

 pub use search::{Grep, GrepBuilder};
@@ -26,6 +27,13 @@ pub enum Error {
    /// pattern. For example, if the line terminator is `\n` and the regex
    /// pattern is `\w+\n\w+`, then the presence of `\n` will cause this error.
    LiteralNotAllowed(char),
+    /// This errors occurs when a line exceeds the buffer size. The buffer
+    /// size is given.
+    LineTooLong(usize),
+    /// An IO error occurred while searching.
+    Io(io::Error),
+    /// An unused enum variant that indicates this enum may be expanded in
+    /// the future and therefore should not be exhaustively matched.
    #[doc(hidden)]
    __Nonexhaustive,
 }
@@ -35,6 +43,8 @@ impl error::Error for Error {
        match *self {
            Error::Regex(ref err) => err.description(),
            Error::LiteralNotAllowed(_) => "use of forbidden literal",
+            Error::LineTooLong(_) => "line exceeds buffer size",
+            Error::Io(ref err) => err.description(),
            Error::__Nonexhaustive => unreachable!(),
        }
    }
@@ -42,6 +52,7 @@ impl error::Error for Error {
    fn cause(&self) -> Option<&error::Error> {
        match *self {
            Error::Regex(ref err) => err.cause(),
+            Error::Io(ref err) => err.cause(),
            _ => None,
        }
    }
@@ -54,6 +65,11 @@ impl fmt::Display for Error {
            Error::LiteralNotAllowed(chr) => {
                write!(f, "Literal '{}' not allowed.", chr)
            }
+            Error::LineTooLong(limit) => {
+                write!(f, "Line exceeded buffer size of {} bytes, try \
+                           searching with memory maps instead.", limit)
+            }
+            Error::Io(ref err) => err.fmt(f),
            Error::__Nonexhaustive => unreachable!(),
        }
    }
@@ -70,3 +86,9 @@ impl From<syntax::Error> for Error {
        Error::Regex(regex::Error::Syntax(err))
    }
 }
+
+impl From<io::Error> for Error {
+    fn from(err: io::Error) -> Error {
+        Error::Io(err)
+    }
+}
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -1,3 +1,4 @@
+use std::cmp;
 use std::io;

 use memchr::{memchr, memrchr};
@@ -6,7 +7,52 @@ use syntax;

 use literals::LiteralSets;
 use nonl;
-use Result;
+use {Error, Result};
+
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Match {
+    start: usize,
+    end: usize,
+    line: Option<usize>,
+    locations: Vec<(usize, usize)>,
+}
+
+impl Match {
+    pub fn new() -> Match {
+        Match::default()
+    }
+
+    /// Return the starting byte offset of the line that matched.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// Return the ending byte offset of the line that matched.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// Return the line number that this match corresponds to.
+    ///
+    /// Note that this is `None` if line numbers aren't being computed. Line
+    /// number tracking can be enabled using `GrepBuilder`.
+    #[inline]
+    pub fn line(&self) -> Option<usize> {
+        self.line
+    }
+
+    /// Return the exact start and end locations (in byte offsets) of every
+    /// regex match in this line.
+    ///
+    /// Note that this always returns an empty slice if exact locations aren't
+    /// computed. Exact location tracking can be enabled using `GrepBuilder`.
+    #[inline]
+    pub fn locations(&self) -> &[(usize, usize)] {
+        &self.locations
+    }
+}

 #[derive(Clone, Debug)]
 pub struct Grep {
@@ -164,6 +210,22 @@ impl Grep {
        }
    }

+    pub fn buffered_reader<'g, R: io::Read>(
+        &'g self,
+        buf: Buffer,
+        rdr: R,
+    ) -> GrepBuffered<'g, R> {
+        GrepBuffered {
+            grep: self,
+            rdr: rdr,
+            b: buf,
+            pos: 0,
+            start: 0,
+            lastnl: 0,
+            end: 0,
+        }
+    }
+
    pub fn read_match(
        &self,
        mat: &mut Match,
@@ -222,48 +284,133 @@ impl Grep {
    }
 }

-#[derive(Clone, Debug, Default, Eq, PartialEq)]
-pub struct Match {
-    start: usize,
-    end: usize,
-    line: Option<usize>,
-    locations: Vec<(usize, usize)>,
+pub struct Buffer {
+    buf: Vec<u8>,
+    tmp: Vec<u8>,
 }

-impl Match {
-    pub fn new() -> Match {
-        Match::default()
+impl Buffer {
+    pub fn new() -> Buffer {
+        Buffer::with_capacity(16 * (1<<10))
    }

-    /// Return the starting byte offset of the line that matched.
-    #[inline]
-    pub fn start(&self) -> usize {
-        self.start
+    pub fn with_capacity(cap: usize) -> Buffer {
+        Buffer {
+            buf: vec![0; cap],
+            tmp: Vec::new(),
+        }
+    }
+}
+
+pub struct GrepBuffered<'g, R> {
+    grep: &'g Grep,
+    rdr: R,
+    b: Buffer,
+    pos: usize,
+    start: usize,
+    lastnl: usize,
+    end: usize,
+}
+
+impl<'g, R: io::Read> GrepBuffered<'g, R> {
+    pub fn into_buffer(self) -> Buffer {
+        self.b
    }

-    /// Return the ending byte offset of the line that matched.
-    #[inline]
-    pub fn end(&self) -> usize {
-        self.end
+    pub fn iter<'b>(&'b mut self) -> IterBuffered<'b, 'g, R> {
+        IterBuffered { grep: self }
    }

-    /// Return the line number that this match corresponds to.
-    ///
-    /// Note that this is `None` if line numbers aren't being computed. Line
-    /// number tracking can be enabled using `GrepBuilder`.
-    #[inline]
-    pub fn line(&self) -> Option<usize> {
-        self.line
+    pub fn read_match(
+        &mut self,
+        mat: &mut Match,
+    ) -> Result<bool> {
+        loop {
+            if self.start == self.lastnl {
+                if !try!(self.fill()) {
+                    return Ok(false);
+                }
+            }
+            let ok = self.grep.read_match(
+                mat, &self.b.buf[..self.lastnl], self.start);
+            if !ok {
+                self.start = self.lastnl;
+                continue;
+            }
+            // Move start to the first possible byte of the next line.
+            self.start = cmp::min(
+                self.lastnl, mat.end.checked_add(1).unwrap());
+            mat.start += self.pos;
+            mat.end += self.pos;
+            return Ok(true);
+        }
    }

-    /// Return the exact start and end locations (in byte offsets) of every
-    /// regex match in this line.
-    ///
-    /// Note that this always returns an empty slice if exact locations aren't
-    /// computed. Exact location tracking can be enabled using `GrepBuilder`.
-    #[inline]
-    pub fn locations(&self) -> &[(usize, usize)] {
-        &self.locations
+    fn fill(&mut self) -> Result<bool> {
+        {
+            // The buffer might have leftover bytes that have not been
+            // searched yet. Leftovers correspond to all bytes proceding the
+            // final \n in the current buffer.
+            //
+            // TODO(ag): Seems like we should be able to memmove from the end
+            // of the buffer to the beginning, but let's do it the stupid (but
+            // safe) way for now.
+            let leftovers = &self.b.buf[self.lastnl..self.end];
+            self.b.tmp.clear();
+            self.b.tmp.resize(leftovers.len(), 0);
+            self.b.tmp.copy_from_slice(leftovers);
+        }
+        // Move the leftovers to the beginning of our buffer.
+        self.b.buf[0..self.b.tmp.len()].copy_from_slice(&self.b.tmp);
+        // Fill the rest with fresh bytes.
+        let nread = try!(self.rdr.read(&mut self.b.buf[self.b.tmp.len()..]));
+        // Now update our various positions.
+        self.pos += self.start;
+        println!("start: {:?}, pos: {:?}", self.start, self.pos);
+        self.start = 0;
+        // The end is the total number of bytes read plus whatever we had for
+        // leftovers.
+        self.end = self.b.tmp.len() + nread;
+        // Find the last new line. All searches on this buffer will be capped
+        // at this position since any proceding bytes may correspond to a
+        // partial line.
+        //
+        // This is a little complicated because must handle the case where
+        // the buffer is not full and no new line character could be found.
+        // We detect this case because this could potentially be a partial
+        // line. If we fill our buffer and still can't find a `\n`, then we
+        // give up.
+        let mut start = 0;
+        let term = self.grep.opts.line_terminator;
+        loop {
+            match memrchr(term, &self.b.buf[start..self.end]) {
+                Some(i) => {
+                    self.lastnl = start + i + 1;
+                    break;
+                }
+                None => {
+                    // If we couldn't find a new line and our buffer is
+                    // completely full, then this line is terribly long and we
+                    // return an error.
+                    if self.end == self.b.buf.len() {
+                        return Err(Error::LineTooLong(self.b.buf.len()));
+                    }
+                    // Otherwise we try to ask for more bytes and look again.
+                    let nread = try!(
+                        self.rdr.read(&mut self.b.buf[self.end..]));
+                    // If we got nothing than we're at EOF and we no longer
+                    // need to care about leftovers.
+                    if nread == 0 {
+                        self.lastnl = self.end;
+                        break;
+                    }
+                    start = self.end;
+                    self.end += nread;
+                }
+            }
+        }
+        // If end is zero, then we've hit EOF and we have no leftovers.
+        Ok(self.end > 0)
    }
 }

@@ -287,21 +434,44 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
    }
 }

-pub struct GrepBuffered<'g, B> {
-    grep: &'g Grep,
-    buf: B,
-    start: usize,
+pub struct IterBuffered<'b, 'g: 'b, R: 'b> {
+    grep: &'b mut GrepBuffered<'g, R>,
 }

-impl<'g, B: BufRead> GrepBuffered {
-    pub fn read_match(
-        &self,
-        mat: &mut Match,
-    ) -> io::Result<bool> {
-        let buf = try!(self.buf.fill_buf());
-        if buf.is_empty() {
-            return Ok(false);
+impl<'b, 'g, R: io::Read> Iterator for IterBuffered<'b, 'g, R> {
+    type Item = Result<Match>;
+
+    fn next(&mut self) -> Option<Result<Match>> {
+        let mut mat = Match::default();
+        match self.grep.read_match(&mut mat) {
+            Err(err) => Some(Err(err)),
+            Ok(false) => None,
+            Ok(true) => Some(Ok(mat)),
        }
-        Ok(false)
+    }
+}
+
+#[allow(dead_code)]
+fn s(bytes: &[u8]) -> String {
+    String::from_utf8(bytes.to_vec()).unwrap()
+}
+
+#[cfg(test)]
+mod tests {
+    #![allow(unused_imports)]
+
+    use super::{Buffer, GrepBuilder, s};
+
+    static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
+
+    #[test]
+    fn buffered() {
+        let g = GrepBuilder::new("Sherlock Holmes").create().unwrap();
+        let mut bg = g.buffered_reader(Buffer::new(), SHERLOCK);
+        let ms: Vec<_> = bg.iter().map(|r| r.unwrap()).collect();
+        let m = ms.last().unwrap();
+        assert_eq!(91, ms.len());
+        assert_eq!(575707, m.start());
+        assert_eq!(575784, m.end());
    }
 }