From 4b8aa91ae5b17bd988ed05ab5c7c387de9247b13 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 18 Jun 2023 13:07:48 -0400 Subject: [PATCH] deps: update to pcre2 0.2.4 0.2.4 updates to PCRE2 10.42 and has a few other nice changes. For example, when `utf` is enabled, the crate will always set the PCRE2_MATCH_INVALID_UTF option. That means we no longer need to do transcoding or UTF-8 validity checks. Because of this, we actually get to remove one of the two uses of `unsafe` in ripgrep's `main` program. (This also updates a couple other dependencies for convenience.) --- Cargo.lock | 16 ++++++++-------- crates/core/args.rs | 33 --------------------------------- crates/pcre2/Cargo.toml | 2 +- crates/pcre2/src/matcher.rs | 29 ++++++++++++++--------------- 4 files changed, 23 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f4bc49b..597aa63e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,9 +80,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ "cfg-if", ] @@ -327,9 +327,9 @@ dependencies = [ [[package]] name = "pcre2" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b30f2f69903b439dd9dc9e824119b82a55bf113b29af8d70948a03c1b11ab1" +checksum = "486aca7e74edb8cab09a48d461177f450a5cca3b55e61d139f7552190e2bbcf5" dependencies = [ "libc", "log", @@ -339,9 +339,9 @@ dependencies = [ [[package]] name = "pcre2-sys" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec30e5e9ec37eb8fbf1dea5989bc957fd3df56fbee5061aa7b7a99dbb37b722" +checksum = "ae234f441970dbd52d4e29bee70f3b56ca83040081cb2b55b7df772b16e0b06e" dependencies = [ "cc", "libc", @@ -457,9 +457,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "bdf3bf93142acad5821c99197022e170842cdbc1c30482b98750c688c640842a" dependencies = [ "itoa", "ryu", diff --git a/crates/core/args.rs b/crates/core/args.rs index 10e6a64f..3d5f7742 100644 --- a/crates/core/args.rs +++ b/crates/core/args.rs @@ -472,24 +472,6 @@ enum EncodingMode { Disabled, } -impl EncodingMode { - /// Checks if an explicit encoding has been set. Returns false for - /// automatic BOM sniffing and no sniffing. - /// - /// This is only used to determine whether PCRE2 needs to have its own - /// UTF-8 checking enabled. If we have an explicit encoding set, then - /// we're always guaranteed to get UTF-8, so we can disable PCRE2's check. - /// Otherwise, we have no such guarantee, and must enable PCRE2' UTF-8 - /// check. - #[cfg(feature = "pcre2")] - fn has_explicit_encoding(&self) -> bool { - match self { - EncodingMode::Some(_) => true, - _ => false, - } - } -} - impl ArgMatches { /// Create an ArgMatches from clap's parse result. fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches { @@ -732,14 +714,6 @@ impl ArgMatches { } if self.unicode() { builder.utf(true).ucp(true); - if self.encoding()?.has_explicit_encoding() { - // SAFETY: If an encoding was specified, then we're guaranteed - // to get valid UTF-8, so we can disable PCRE2's UTF checking. - // (Feeding invalid UTF-8 to PCRE2 is undefined behavior.) - unsafe { - builder.disable_utf_check(); - } - } } if self.is_present("multiline") { builder.dotall(self.is_present("multiline-dotall")); @@ -1080,7 +1054,6 @@ impl ArgMatches { } let label = match self.value_of_lossy("encoding") { - None if self.pcre2_unicode() => "utf-8".to_string(), None => return Ok(EncodingMode::Auto), Some(label) => label, }; @@ -1641,12 +1614,6 @@ impl ArgMatches { !(self.is_present("no-unicode") || self.is_present("no-pcre2-unicode")) } - /// Returns true if and only if PCRE2 is enabled and its Unicode mode is - /// enabled. - fn pcre2_unicode(&self) -> bool { - self.is_present("pcre2") && self.unicode() - } - /// Returns true if and only if file names containing each match should /// be emitted. fn with_filename(&self, paths: &[PathBuf]) -> bool { diff --git a/crates/pcre2/Cargo.toml b/crates/pcre2/Cargo.toml index 67653057..c0c3ede2 100644 --- a/crates/pcre2/Cargo.toml +++ b/crates/pcre2/Cargo.toml @@ -15,4 +15,4 @@ edition = "2018" [dependencies] grep-matcher = { version = "0.1.6", path = "../matcher" } -pcre2 = "0.2.3" +pcre2 = "0.2.4" diff --git a/crates/pcre2/src/matcher.rs b/crates/pcre2/src/matcher.rs index 94a6f338..a8c47c32 100644 --- a/crates/pcre2/src/matcher.rs +++ b/crates/pcre2/src/matcher.rs @@ -178,23 +178,22 @@ impl RegexMatcherBuilder { self } - /// When UTF matching mode is enabled, this will disable the UTF checking - /// that PCRE2 will normally perform automatically. If UTF matching mode - /// is not enabled, then this has no effect. + /// This is now deprecated and is a no-op. /// - /// UTF checking is enabled by default when UTF matching mode is enabled. - /// If UTF matching mode is enabled and UTF checking is enabled, then PCRE2 - /// will return an error if you attempt to search a subject string that is - /// not valid UTF-8. + /// Previously, this option permitted disabling PCRE2's UTF-8 validity + /// check, which could result in undefined behavior if the haystack was + /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`, + /// in 10.34 which this crate always sets. When this option is enabled, + /// PCRE2 claims to not have undefined behavior when the haystack is + /// invalid UTF-8. /// - /// # Safety - /// - /// It is undefined behavior to disable the UTF check in UTF matching mode - /// and search a subject string that is not valid UTF-8. When the UTF check - /// is disabled, callers must guarantee that the subject string is valid - /// UTF-8. - pub unsafe fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder { - self.builder.disable_utf_check(); + /// Therefore, disabling the UTF-8 check is not something that is exposed + /// by this crate. + #[deprecated( + since = "0.2.4", + note = "now a no-op due to new PCRE2 features" + )] + pub fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder { self }