use std::io::{self, Write}; use std::path::Path; use std::time::Instant; use grep_matcher::{Match, Matcher}; use grep_searcher::{ Searcher, Sink, SinkContext, SinkContextKind, SinkError, SinkFinish, SinkMatch, }; use serde_json as json; use counter::CounterWriter; use jsont; use stats::Stats; /// The configuration for the JSON printer. /// /// This is manipulated by the JSONBuilder and then referenced by the actual /// implementation. Once a printer is build, the configuration is frozen and /// cannot changed. #[derive(Debug, Clone)] struct Config { pretty: bool, max_matches: Option, always_begin_end: bool, } impl Default for Config { fn default() -> Config { Config { pretty: false, max_matches: None, always_begin_end: false } } } /// A builder for a JSON lines printer. /// /// The builder permits configuring how the printer behaves. The JSON printer /// has fewer configuration options than the standard printer because it is /// a structured format, and the printer always attempts to find the most /// information possible. /// /// Some configuration options, such as whether line numbers are included or /// whether contextual lines are shown, are drawn directly from the /// `grep_searcher::Searcher`'s configuration. /// /// Once a `JSON` printer is built, its configuration cannot be changed. #[derive(Clone, Debug)] pub struct JSONBuilder { config: Config, } impl JSONBuilder { /// Return a new builder for configuring the JSON printer. pub fn new() -> JSONBuilder { JSONBuilder { config: Config::default() } } /// Create a JSON printer that writes results to the given writer. pub fn build(&self, wtr: W) -> JSON { JSON { config: self.config.clone(), wtr: CounterWriter::new(wtr), matches: vec![], } } /// Print JSON in a pretty printed format. /// /// Enabling this will no longer produce a "JSON lines" format, in that /// each JSON object printed may span multiple lines. /// /// This is disabled by default. pub fn pretty(&mut self, yes: bool) -> &mut JSONBuilder { self.config.pretty = yes; self } /// Set the maximum amount of matches that are printed. /// /// If multi line search is enabled and a match spans multiple lines, then /// that match is counted exactly once for the purposes of enforcing this /// limit, regardless of how many lines it spans. pub fn max_matches(&mut self, limit: Option) -> &mut JSONBuilder { self.config.max_matches = limit; self } /// When enabled, the `begin` and `end` messages are always emitted, even /// when no match is found. /// /// When disabled, the `begin` and `end` messages are only shown if there /// is at least one `match` or `context` message. /// /// This is disabled by default. pub fn always_begin_end(&mut self, yes: bool) -> &mut JSONBuilder { self.config.always_begin_end = yes; self } } /// The JSON printer, which emits results in a JSON lines format. /// /// This type is generic over `W`, which represents any implementation of /// the standard library `io::Write` trait. /// /// # Format /// /// This section describes the JSON format used by this printer. /// /// To skip the rigamarole, take a look at the /// [example](#example) /// at the end. /// /// ## Overview /// /// The format of this printer is the [JSON Lines](https://jsonlines.org/) /// format. Specifically, this printer emits a sequence of messages, where /// each message is encoded as a single JSON value on a single line. There are /// four different types of messages (and this number may expand over time): /// /// * **begin** - A message that indicates a file is being searched. /// * **end** - A message the indicates a file is done being searched. This /// message also include summary statistics about the search. /// * **match** - A message that indicates a match was found. This includes /// the text and offsets of the match. /// * **context** - A message that indicates a contextual line was found. /// This includes the text of the line, along with any match information if /// the search was inverted. /// /// Every message is encoded in the same envelope format, which includes a tag /// indicating the message type along with an object for the payload: /// /// ```json /// { /// "type": "{begin|end|match|context}", /// "data": { ... } /// } /// ``` /// /// The message itself is encoded in the envelope's `data` key. /// /// ## Text encoding /// /// Before describing each message format, we first must briefly discuss text /// encoding, since it factors into every type of message. In particular, JSON /// may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this /// printer, we need only worry about UTF-8. The problem here is that searching /// is not limited to UTF-8 exclusively, which in turn implies that matches /// may be reported that contain invalid UTF-8. Moreover, this printer may /// also print file paths, and the encoding of file paths is itself not /// guarnateed to be valid UTF-8. Therefore, this printer must deal with the /// presence of invalid UTF-8 somehow. The printer could silently ignore such /// things completely, or even lossily transcode invalid UTF-8 to valid UTF-8 /// by replacing all invalid sequences with the Unicode replacement character. /// However, this would prevent consumers of this format from accessing the /// original data in a non-lossy way. /// /// Therefore, this printer will emit valid UTF-8 encoded bytes as normal /// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To /// communicate whether this process occurs or not, strings are keyed by the /// name `text` where as arbitrary bytes are keyed by `bytes`. /// /// For example, when a path is included in a message, it is formatted like so, /// if and only if the path is valid UTF-8: /// /// ```json /// { /// "path": { /// "text": "/home/ubuntu/lib.rs" /// } /// } /// ``` /// /// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte /// makes it invalid UTF-8, the path would instead be encoded like so: /// /// ```json /// { /// "path": { /// "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM=" /// } /// } /// ``` /// /// This same representation is used for reporting matches as well. /// /// The printer guarantees that the `text` field is used whenever the /// underlying bytes are valid UTF-8. /// /// ## Wire format /// /// This section documents the wire format emitted by this printer, starting /// with the four types of messages. /// /// Each message has its own format, and is contained inside an envelope that /// indicates the type of message. The envelope has these fields: /// /// * **type** - A string indicating the type of this message. It may be one /// of four possible strings: `begin`, `end`, `match` or `context`. This /// list may expand over time. /// * **data** - The actual message data. The format of this field depends on /// the value of `type`. The possible message formats are /// [`begin`](#message-begin), /// [`end`](#message-end), /// [`match`](#message-match), /// [`context`](#message-context). /// /// #### Message: **begin** /// /// This message indicates that a search has begun. It has these fields: /// /// * **path** - An /// [arbitrary data object](#object-arbitrary-data) /// representing the file path corresponding to the search, if one is /// present. If no file path is available, then this field is `null`. /// /// #### Message: **end** /// /// This message indicates that a search has finished. It has these fields: /// /// * **path** - An /// [arbitrary data object](#object-arbitrary-data) /// representing the file path corresponding to the search, if one is /// present. If no file path is available, then this field is `null`. /// * **binary_offset** - The absolute offset in the data searched /// corresponding to the place at which binary data was detected. If no /// binary data was detected (or if binary detection was disabled), then this /// field is `null`. /// * **stats** - A [`stats` object](#object-stats) that contains summary /// statistics for the previous search. /// /// #### Message: **match** /// /// This message indicates that a match has been found. A match generally /// corresponds to a single line of text, although it may correspond to /// multiple lines if the search can emit matches over multiple lines. It /// has these fields: /// /// * **path** - An /// [arbitrary data object](#object-arbitrary-data) /// representing the file path corresponding to the search, if one is /// present. If no file path is available, then this field is `null`. /// * **lines** - An /// [arbitrary data object](#object-arbitrary-data) /// representing one or more lines contained in this match. /// * **line_number** - If the searcher has been configured to report line /// numbers, then this corresponds to the line number of the first line /// in `lines`. If no line numbers are available, then this is `null`. /// * **absolute_offset** - The absolute byte offset corresponding to the start /// of `lines` in the data being searched. /// * **submatches** - An array of [`submatch` objects](#object-submatch) /// corresponding to matches in `lines`. The offsets included in each /// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64 /// encoded, then the byte offsets correspond to the data after base64 /// decoding.) The `submatch` objects are guaranteed to be sorted by their /// starting offsets. Note that it is possible for this array to be empty, /// for example, when searching reports inverted matches. /// /// #### Message: **context** /// /// This message indicates that a contextual line has been found. A contextual /// line is a line that doesn't contain a match, but is generally adjacent to /// a line that does contain a match. The precise way in which contextual lines /// are reported is determined by the searcher. It has these fields, which are /// exactly the same fields found in a [`match`](#message-match): /// /// * **path** - An /// [arbitrary data object](#object-arbitrary-data) /// representing the file path corresponding to the search, if one is /// present. If no file path is available, then this field is `null`. /// * **lines** - An /// [arbitrary data object](#object-arbitrary-data) /// representing one or more lines contained in this context. This includes /// line terminators, if they're present. /// * **line_number** - If the searcher has been configured to report line /// numbers, then this corresponds to the line number of the first line /// in `lines`. If no line numbers are available, then this is `null`. /// * **absolute_offset** - The absolute byte offset corresponding to the start /// of `lines` in the data being searched. /// * **submatches** - An array of [`submatch` objects](#object-submatch) /// corresponding to matches in `lines`. The offsets included in each /// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64 /// encoded, then the byte offsets correspond to the data after base64 /// decoding.) The `submatch` objects are guaranteed to be sorted by /// their starting offsets. Note that it is possible for this array to be /// non-empty, for example, when searching reports inverted matches such that /// the original matcher could match things in the contextual lines. /// /// #### Object: **submatch** /// /// This object describes submatches found within `match` or `context` /// messages. The `start` and `end` fields indicate the half-open interval on /// which the match occurs (`start` is included, but `end` is not). It is /// guaranteed that `start <= end`. It has these fields: /// /// * **match** - An /// [arbitrary data object](#object-arbitrary-data) /// corresponding to the text in this submatch. /// * **start** - A byte offset indicating the start of this match. This offset /// is generally reported in terms of the parent object's data. For example, /// the `lines` field in the /// [`match`](#message-match) or [`context`](#message-context) /// messages. /// * **end** - A byte offset indicating the end of this match. This offset /// is generally reported in terms of the parent object's data. For example, /// the `lines` field in the /// [`match`](#message-match) or [`context`](#message-context) /// messages. /// /// #### Object: **stats** /// /// This object is included in messages and contains summary statistics about /// a search. It has these fields: /// /// * **elapsed** - A [`duration` object](#object-duration) describing the /// length of time that elapsed while performing the search. /// * **searches** - The number of searches that have run. For this printer, /// this value is always `1`. (Implementations may emit additional message /// types that use this same `stats` object that represents summary /// statistics over multiple searches.) /// * **searches_with_match** - The number of searches that have run that have /// found at least one match. This is never more than `searches`. /// * **bytes_searched** - The total number of bytes that have been searched. /// * **bytes_printed** - The total number of bytes that have been printed. /// This includes everything emitted by this printer. /// * **matched_lines** - The total number of lines that participated in a /// match. When matches may contain multiple lines, then this includes every /// line that is part of every match. /// * **matches** - The total number of matches. There may be multiple matches /// per line. When matches may contain multiple lines, each match is counted /// only once, regardless of how many lines it spans. /// /// #### Object: **duration** /// /// This object includes a few fields for describing a duration. Two of its /// fields, `secs` and `nanos`, can be combined to give nanosecond precision /// on systems that support it. It has these fields: /// /// * **secs** - A whole number of seconds indicating the length of this /// duration. /// * **nanos** - A fractional part of this duration represent by nanoseconds. /// If nanosecond precision isn't supported, then this is typically rounded /// up to the nearest number of nanoseconds. /// * **human** - A human readable string describing the length of the /// duration. The format of the string is itself unspecified. /// /// #### Object: **arbitrary data** /// /// This object is used whenever arbitrary data needs to be represented as a /// JSON value. This object contains two fields, where generally only one of /// the fields is present: /// /// * **text** - A normal JSON string that is UTF-8 encoded. This field is /// populated if and only if the underlying data is valid UTF-8. /// * **bytes** - A normal JSON string that is a base64 encoding of the /// underlying bytes. /// /// More information on the motivation for this representation can be seen in /// the section [text encoding](#text-encoding) above. /// /// ## Example /// /// This section shows a small example that includes all message types. /// /// Here's the file we want to search, located at `/home/andrew/sherlock`: /// /// ```text /// For the Doctor Watsons of this world, as opposed to the Sherlock /// Holmeses, success in the province of detective work must always /// be, to a very large extent, the result of luck. Sherlock Holmes /// can extract a clew from a wisp of straw or a flake of cigar ash; /// but Doctor Watson has to have it taken out for him and dusted, /// and exhibited clearly, with a label attached. /// ``` /// /// Searching for `Watson` with a `before_context` of `1` with line numbers /// enabled shows something like this using the standard printer: /// /// ```text /// sherlock:1:For the Doctor Watsons of this world, as opposed to the Sherlock /// -- /// sherlock-4-can extract a clew from a wisp of straw or a flake of cigar ash; /// sherlock:5:but Doctor Watson has to have it taken out for him and dusted, /// ``` /// /// Here's what the same search looks like using the JSON wire format described /// above, where in we show semi-prettified JSON (instead of a strict JSON /// Lines format), for illustrative purposes: /// /// ```json /// { /// "type": "begin", /// "data": { /// "path": {"text": "/home/andrew/sherlock"}} /// } /// } /// { /// "type": "match", /// "data": { /// "path": {"text": "/home/andrew/sherlock"}, /// "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"}, /// "line_number": 1, /// "absolute_offset": 0, /// "submatches": [ /// {"match": {"text": "Watson"}, "start": 15, "end": 21} /// ] /// } /// } /// { /// "type": "context", /// "data": { /// "path": {"text": "/home/andrew/sherlock"}, /// "lines": {"text": "can extract a clew from a wisp of straw or a flake of cigar ash;\n"}, /// "line_number": 4, /// "absolute_offset": 193, /// "submatches": [] /// } /// } /// { /// "type": "match", /// "data": { /// "path": {"text": "/home/andrew/sherlock"}, /// "lines": {"text": "but Doctor Watson has to have it taken out for him and dusted,\n"}, /// "line_number": 5, /// "absolute_offset": 258, /// "submatches": [ /// {"match": {"text": "Watson"}, "start": 11, "end": 17} /// ] /// } /// } /// { /// "type": "end", /// "data": { /// "path": {"text": "/home/andrew/sherlock"}, /// "binary_offset": null, /// "stats": { /// "elapsed": {"secs": 0, "nanos": 36296, "human": "0.0000s"}, /// "searches": 1, /// "searches_with_match": 1, /// "bytes_searched": 367, /// "bytes_printed": 1151, /// "matched_lines": 2, /// "matches": 2 /// } /// } /// } /// ``` #[derive(Debug)] pub struct JSON { config: Config, wtr: CounterWriter, matches: Vec, } impl JSON { /// Return a JSON lines printer with a default configuration that writes /// matches to the given writer. pub fn new(wtr: W) -> JSON { JSONBuilder::new().build(wtr) } /// Return an implementation of `Sink` for the JSON printer. /// /// This does not associate the printer with a file path, which means this /// implementation will never print a file path along with the matches. pub fn sink<'s, M: Matcher>( &'s mut self, matcher: M, ) -> JSONSink<'static, 's, M, W> { JSONSink { matcher: matcher, json: self, path: None, start_time: Instant::now(), match_count: 0, after_context_remaining: 0, binary_byte_offset: None, begin_printed: false, stats: Stats::new(), } } /// Return an implementation of `Sink` associated with a file path. /// /// When the printer is associated with a path, then it may, depending on /// its configuration, print the path along with the matches found. pub fn sink_with_path<'p, 's, M, P>( &'s mut self, matcher: M, path: &'p P, ) -> JSONSink<'p, 's, M, W> where M: Matcher, P: ?Sized + AsRef, { JSONSink { matcher: matcher, json: self, path: Some(path.as_ref()), start_time: Instant::now(), match_count: 0, after_context_remaining: 0, binary_byte_offset: None, begin_printed: false, stats: Stats::new(), } } /// Write the given message followed by a new line. The new line is /// determined from the configuration of the given searcher. fn write_message(&mut self, message: &jsont::Message) -> io::Result<()> { if self.config.pretty { json::to_writer_pretty(&mut self.wtr, message)?; } else { json::to_writer(&mut self.wtr, message)?; } self.wtr.write(&[b'\n'])?; Ok(()) } } impl JSON { /// Returns true if and only if this printer has written at least one byte /// to the underlying writer during any of the previous searches. pub fn has_written(&self) -> bool { self.wtr.total_count() > 0 } /// Return a mutable reference to the underlying writer. pub fn get_mut(&mut self) -> &mut W { self.wtr.get_mut() } /// Consume this printer and return back ownership of the underlying /// writer. pub fn into_inner(self) -> W { self.wtr.into_inner() } } /// An implementation of `Sink` associated with a matcher and an optional file /// path for the JSON printer. /// /// This type is generic over a few type parameters: /// /// * `'p` refers to the lifetime of the file path, if one is provided. When /// no file path is given, then this is `'static`. /// * `'s` refers to the lifetime of the /// [`JSON`](struct.JSON.html) /// printer that this type borrows. /// * `M` refers to the type of matcher used by /// `grep_searcher::Searcher` that is reporting results to this sink. /// * `W` refers to the underlying writer that this printer is writing its /// output to. #[derive(Debug)] pub struct JSONSink<'p, 's, M: Matcher, W: 's> { matcher: M, json: &'s mut JSON, path: Option<&'p Path>, start_time: Instant, match_count: u64, after_context_remaining: u64, binary_byte_offset: Option, begin_printed: bool, stats: Stats, } impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> { /// Returns true if and only if this printer received a match in the /// previous search. /// /// This is unaffected by the result of searches before the previous /// search. pub fn has_match(&self) -> bool { self.match_count > 0 } /// Return the total number of matches reported to this sink. /// /// This corresponds to the number of times `Sink::matched` is called. pub fn match_count(&self) -> u64 { self.match_count } /// If binary data was found in the previous search, this returns the /// offset at which the binary data was first detected. /// /// The offset returned is an absolute offset relative to the entire /// set of bytes searched. /// /// This is unaffected by the result of searches before the previous /// search. e.g., If the search prior to the previous search found binary /// data but the previous search found no binary data, then this will /// return `None`. pub fn binary_byte_offset(&self) -> Option { self.binary_byte_offset } /// Return a reference to the stats produced by the printer for all /// searches executed on this sink. pub fn stats(&self) -> &Stats { &self.stats } /// Execute the matcher over the given bytes and record the match /// locations if the current configuration demands match granularity. fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> { self.json.matches.clear(); // If printing requires knowing the location of each individual match, // then compute and stored those right now for use later. While this // adds an extra copy for storing the matches, we do amortize the // allocation for it and this greatly simplifies the printing logic to // the extent that it's easy to ensure that we never do more than // one search to find the matches. let matches = &mut self.json.matches; self.matcher .find_iter(bytes, |m| { matches.push(m); true }) .map_err(io::Error::error_message)?; // Don't report empty matches appearing at the end of the bytes. if !matches.is_empty() && matches.last().unwrap().is_empty() && matches.last().unwrap().start() >= bytes.len() { matches.pop().unwrap(); } Ok(()) } /// Returns true if this printer should quit. /// /// This implements the logic for handling quitting after seeing a certain /// amount of matches. In most cases, the logic is simple, but we must /// permit all "after" contextual lines to print after reaching the limit. fn should_quit(&self) -> bool { let limit = match self.json.config.max_matches { None => return false, Some(limit) => limit, }; if self.match_count < limit { return false; } self.after_context_remaining == 0 } /// Write the "begin" message. fn write_begin_message(&mut self) -> io::Result<()> { if self.begin_printed { return Ok(()); } let msg = jsont::Message::Begin(jsont::Begin { path: self.path }); self.json.write_message(&msg)?; self.begin_printed = true; Ok(()) } } impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> { type Error = io::Error; fn matched( &mut self, searcher: &Searcher, mat: &SinkMatch, ) -> Result { self.write_begin_message()?; self.match_count += 1; self.after_context_remaining = searcher.after_context() as u64; self.record_matches(mat.bytes())?; self.stats.add_matches(self.json.matches.len() as u64); self.stats.add_matched_lines(mat.lines().count() as u64); let submatches = SubMatches::new(mat.bytes(), &self.json.matches); let msg = jsont::Message::Match(jsont::Match { path: self.path, lines: mat.bytes(), line_number: mat.line_number(), absolute_offset: mat.absolute_byte_offset(), submatches: submatches.as_slice(), }); self.json.write_message(&msg)?; Ok(!self.should_quit()) } fn context( &mut self, searcher: &Searcher, ctx: &SinkContext, ) -> Result { self.write_begin_message()?; self.json.matches.clear(); if ctx.kind() == &SinkContextKind::After { self.after_context_remaining = self.after_context_remaining.saturating_sub(1); } let submatches = if searcher.invert_match() { self.record_matches(ctx.bytes())?; SubMatches::new(ctx.bytes(), &self.json.matches) } else { SubMatches::empty() }; let msg = jsont::Message::Context(jsont::Context { path: self.path, lines: ctx.bytes(), line_number: ctx.line_number(), absolute_offset: ctx.absolute_byte_offset(), submatches: submatches.as_slice(), }); self.json.write_message(&msg)?; Ok(!self.should_quit()) } fn begin(&mut self, _searcher: &Searcher) -> Result { self.json.wtr.reset_count(); self.start_time = Instant::now(); self.match_count = 0; self.after_context_remaining = 0; self.binary_byte_offset = None; if self.json.config.max_matches == Some(0) { return Ok(false); } if !self.json.config.always_begin_end { return Ok(true); } self.write_begin_message()?; Ok(true) } fn finish( &mut self, _searcher: &Searcher, finish: &SinkFinish, ) -> Result<(), io::Error> { if !self.begin_printed { return Ok(()); } self.binary_byte_offset = finish.binary_byte_offset(); self.stats.add_elapsed(self.start_time.elapsed()); self.stats.add_searches(1); if self.match_count > 0 { self.stats.add_searches_with_match(1); } self.stats.add_bytes_searched(finish.byte_count()); self.stats.add_bytes_printed(self.json.wtr.count()); let msg = jsont::Message::End(jsont::End { path: self.path, binary_offset: finish.binary_byte_offset(), stats: self.stats.clone(), }); self.json.write_message(&msg)?; Ok(()) } } /// SubMatches represents a set of matches in a contiguous range of bytes. /// /// A simpler representation for this would just simply be `Vec`, /// but the common case is exactly one match per range of bytes, which we /// specialize here using a fixed size array without any allocation. enum SubMatches<'a> { Empty, Small([jsont::SubMatch<'a>; 1]), Big(Vec>), } impl<'a> SubMatches<'a> { /// Create a new set of match ranges from a set of matches and the /// corresponding bytes that those matches apply to. fn new(bytes: &'a [u8], matches: &[Match]) -> SubMatches<'a> { if matches.len() == 1 { let mat = matches[0]; SubMatches::Small([jsont::SubMatch { m: &bytes[mat], start: mat.start(), end: mat.end(), }]) } else { let mut match_ranges = vec![]; for &mat in matches { match_ranges.push(jsont::SubMatch { m: &bytes[mat], start: mat.start(), end: mat.end(), }); } SubMatches::Big(match_ranges) } } /// Create an empty set of match ranges. fn empty() -> SubMatches<'static> { SubMatches::Empty } /// Return this set of match ranges as a slice. fn as_slice(&self) -> &[jsont::SubMatch] { match *self { SubMatches::Empty => &[], SubMatches::Small(ref x) => x, SubMatches::Big(ref x) => x, } } } #[cfg(test)] mod tests { use grep_matcher::LineTerminator; use grep_regex::{RegexMatcher, RegexMatcherBuilder}; use grep_searcher::SearcherBuilder; use super::{JSONBuilder, JSON}; const SHERLOCK: &'static [u8] = b"\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, success in the province of detective work must always be, to a very large extent, the result of luck. Sherlock Holmes can extract a clew from a wisp of straw or a flake of cigar ash; but Doctor Watson has to have it taken out for him and dusted, and exhibited clearly, with a label attached. "; fn printer_contents(printer: &mut JSON>) -> String { String::from_utf8(printer.get_mut().to_owned()).unwrap() } #[test] fn binary_detection() { use grep_searcher::BinaryDetection; const BINARY: &'static [u8] = b"\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, success in the province of detective work must always be, to a very large extent, the result of luck. Sherlock Holmes can extract a clew \x00 from a wisp of straw or a flake of cigar ash; but Doctor Watson has to have it taken out for him and dusted, and exhibited clearly, with a label attached.\ "; let matcher = RegexMatcher::new(r"Watson").unwrap(); let mut printer = JSONBuilder::new().build(vec![]); SearcherBuilder::new() .binary_detection(BinaryDetection::quit(b'\x00')) .heap_limit(Some(80)) .build() .search_reader(&matcher, BINARY, printer.sink(&matcher)) .unwrap(); let got = printer_contents(&mut printer); assert_eq!(got.lines().count(), 3); let last = got.lines().last().unwrap(); assert!(last.contains(r#""binary_offset":212,"#)); } #[test] fn max_matches() { let matcher = RegexMatcher::new(r"Watson").unwrap(); let mut printer = JSONBuilder::new().max_matches(Some(1)).build(vec![]); SearcherBuilder::new() .build() .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) .unwrap(); let got = printer_contents(&mut printer); assert_eq!(got.lines().count(), 3); } #[test] fn no_match() { let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap(); let mut printer = JSONBuilder::new().build(vec![]); SearcherBuilder::new() .build() .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) .unwrap(); let got = printer_contents(&mut printer); assert!(got.is_empty()); } #[test] fn always_begin_end_no_match() { let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap(); let mut printer = JSONBuilder::new().always_begin_end(true).build(vec![]); SearcherBuilder::new() .build() .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) .unwrap(); let got = printer_contents(&mut printer); assert_eq!(got.lines().count(), 2); assert!(got.contains("begin") && got.contains("end")); } #[test] fn missing_crlf() { let haystack = "test\r\n".as_bytes(); let matcher = RegexMatcherBuilder::new().build("test").unwrap(); let mut printer = JSONBuilder::new().build(vec![]); SearcherBuilder::new() .build() .search_reader(&matcher, haystack, printer.sink(&matcher)) .unwrap(); let got = printer_contents(&mut printer); assert_eq!(got.lines().count(), 3); assert!( got.lines().nth(1).unwrap().contains(r"test\r\n"), r"missing 'test\r\n' in '{}'", got.lines().nth(1).unwrap(), ); let matcher = RegexMatcherBuilder::new().crlf(true).build("test").unwrap(); let mut printer = JSONBuilder::new().build(vec![]); SearcherBuilder::new() .line_terminator(LineTerminator::crlf()) .build() .search_reader(&matcher, haystack, printer.sink(&matcher)) .unwrap(); let got = printer_contents(&mut printer); assert_eq!(got.lines().count(), 3); assert!( got.lines().nth(1).unwrap().contains(r"test\r\n"), r"missing 'test\r\n' in '{}'", got.lines().nth(1).unwrap(), ); } }