diff --git a/CHANGELOG.md b/CHANGELOG.md
index e72d48af..30736139 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@ TBD
 ===
 TODO
 
+Deprecations:
+
+* The `--no-pcre2-unicode` flag is deprecated. Instead, use the `--no-unicode`
+  flag, which applies to both the default regex engine and PCRE2. For now,
+  `--no-pcre2-unicode` and `--pcre2-unicode` are aliases to `--no-unicode`
+  and `--unicode`, respectively. The `--[no-]pcre2-unicode` flags may be
+  removed in a future release.
+
 Performance improvements:
 
 * [PERF #1381](https://github.com/BurntSushi/ripgrep/pull/1381):
@@ -27,6 +35,8 @@ Feature enhancements:
   Add `--no-require-git` flag to allow ripgrep to respect gitignores anywhere.
 * [FEATURE #1420](https://github.com/BurntSushi/ripgrep/pull/1420):
   Add `--no-ignore-exclude` to disregard rules in `.git/info/exclude` files.
+* FEATURE:
+  Add `--no-unicode` flag. This works on all supported regex engines.
 
 Bug fixes:
 
diff --git a/complete/_rg b/complete/_rg
index 82dc3cdb..44d63e63 100644
--- a/complete/_rg
+++ b/complete/_rg
@@ -144,6 +144,8 @@ _rg() {
   + '(ignore-vcs)' # VCS ignore-file options
   "--no-ignore-vcs[don't respect version control ignore files]"
   $no'--ignore-vcs[respect version control ignore files]'
+
+  + '(require-git)' # git specific settings
   "--no-require-git[don't require git repository to respect gitignore rules]"
   $no'--require-git[require git repository to respect gitignore rules]'
 
@@ -270,6 +272,10 @@ _rg() {
   {-w,--word-regexp}'[only show matches surrounded by word boundaries]'
   {-x,--line-regexp}'[only show matches surrounded by line boundaries]'
 
+  + '(unicode)' # Unicode options
+  $no'--unicode[enable Unicode mode]'
+  '--no-unicode[disable Unicode mode]'
+
   + '(zip)' # Compression options
   '(--pre)'{-z,--search-zip}'[search in compressed files]'
   $no"--no-search-zip[don't search in compressed files]"
diff --git a/src/app.rs b/src/app.rs
index 320261b5..dd1e296b 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -603,6 +603,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
     flag_no_messages(&mut args);
     flag_no_pcre2_unicode(&mut args);
     flag_no_require_git(&mut args);
+    flag_no_unicode(&mut args);
     flag_null(&mut args);
     flag_null_data(&mut args);
     flag_one_file_system(&mut args);
@@ -1890,42 +1891,21 @@ This flag can be disabled with the --messages flag.
 fn flag_no_pcre2_unicode(args: &mut Vec<RGArg>) {
     const SHORT: &str = "Disable Unicode mode for PCRE2 matching.";
     const LONG: &str = long!("\
-When PCRE2 matching is enabled, this flag will disable Unicode mode, which is
-otherwise enabled by default. If PCRE2 matching is not enabled, then this flag
-has no effect.
+DEPRECATED. Use --no-unicode instead.
 
-When PCRE2's Unicode mode is enabled, several different types of patterns
-become Unicode aware. This includes '\\b', '\\B', '\\w', '\\W', '\\d', '\\D',
-'\\s' and '\\S'. Similarly, the '.' meta character will match any Unicode
-codepoint instead of any byte. Caseless matching will also use Unicode simple
-case folding instead of ASCII-only case insensitivity.
-
-Unicode mode in PCRE2 represents a critical trade off in the user experience
-of ripgrep. In particular, unlike the default regex engine, PCRE2 does not
-support the ability to search possibly invalid UTF-8 with Unicode features
-enabled. Instead, PCRE2 *requires* that everything it searches when Unicode
-mode is enabled is valid UTF-8. (Or valid UTF-16/UTF-32, but for the purposes
-of ripgrep, we only discuss UTF-8.) This means that if you have PCRE2's Unicode
-mode enabled and you attempt to search invalid UTF-8, then the search for that
-file will halt and print an error. For this reason, when PCRE2's Unicode mode
-is enabled, ripgrep will automatically \"fix\" invalid UTF-8 sequences by
-replacing them with the Unicode replacement codepoint.
-
-If you would rather see the encoding errors surfaced by PCRE2 when Unicode mode
-is enabled, then pass the --no-encoding flag to disable all transcoding.
-
-Related flags: --pcre2
-
-This flag can be disabled with --pcre2-unicode.
+This flag is now an alias for --no-unicode. And --pcre2-unicode is an alias
+for --unicode.
 ");
     let arg = RGArg::switch("no-pcre2-unicode")
         .help(SHORT).long_help(LONG)
-        .overrides("pcre2-unicode");
+        .overrides("pcre2-unicode")
+        .overrides("unicode");
     args.push(arg);
 
     let arg = RGArg::switch("pcre2-unicode")
         .hidden()
-        .overrides("no-pcre2-unicode");
+        .overrides("no-pcre2-unicode")
+        .overrides("no-unicode");
     args.push(arg);
 }
 
@@ -1951,6 +1931,55 @@ This flag can be disabled with --require-git.
     args.push(arg);
 }
 
+fn flag_no_unicode(args: &mut Vec<RGArg>) {
+    const SHORT: &str = "Disable Unicode mode.";
+    const LONG: &str = long!("\
+By default, ripgrep will enable \"Unicode mode\" in all of its regexes. This
+has a number of consequences:
+
+* '.' will only match valid UTF-8 encoded scalar values.
+* Classes like '\\w', '\\s', '\\d' are all Unicode aware and much bigger
+  than their ASCII only versions.
+* Case insensitive matching will use Unicode case folding.
+* A large array of classes like '\\p{Emoji}' are available.
+* Word boundaries ('\\b' and '\\B') use the Unicode definition of a word
+  character.
+
+In some cases it can be desirable to turn these things off. The --no-unicode
+flag will do exactly that.
+
+For PCRE2 specifically, Unicode mode represents a critical trade off in the
+user experience of ripgrep. In particular, unlike the default regex engine,
+PCRE2 does not support the ability to search possibly invalid UTF-8 with
+Unicode features enabled. Instead, PCRE2 *requires* that everything it searches
+when Unicode mode is enabled is valid UTF-8. (Or valid UTF-16/UTF-32, but for
+the purposes of ripgrep, we only discuss UTF-8.) This means that if you have
+PCRE2's Unicode mode enabled and you attempt to search invalid UTF-8, then
+the search for that file will halt and print an error. For this reason, when
+PCRE2's Unicode mode is enabled, ripgrep will automatically \"fix\" invalid
+UTF-8 sequences by replacing them with the Unicode replacement codepoint. This
+penalty does not occur when using the default regex engine.
+
+If you would rather see the encoding errors surfaced by PCRE2 when Unicode mode
+is enabled, then pass the --no-encoding flag to disable all transcoding.
+
+The --no-unicode flag can be disabled with --unicode. Note that
+--no-pcre2-unicode and --pcre2-unicode are aliases for --no-unicode and
+--unicode, respectively.
+");
+    let arg = RGArg::switch("no-unicode")
+        .help(SHORT).long_help(LONG)
+        .overrides("unicode")
+        .overrides("pcre2-unicode");
+    args.push(arg);
+
+    let arg = RGArg::switch("unicode")
+        .hidden()
+        .overrides("no-unicode")
+        .overrides("no-pcre2-unicode");
+    args.push(arg);
+}
+
 fn flag_null(args: &mut Vec<RGArg>) {
     const SHORT: &str = "Print a NUL byte after file paths.";
     const LONG: &str = long!("\
diff --git a/src/args.rs b/src/args.rs
index d9eeb2e1..4989f14d 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -654,7 +654,7 @@ impl ArgMatches {
             .case_smart(self.case_smart())
             .case_insensitive(self.case_insensitive())
             .multi_line(true)
-            .unicode(true)
+            .unicode(self.unicode())
             .octal(false)
             .word(self.is_present("word-regexp"));
         if self.is_present("multiline") {
@@ -720,7 +720,7 @@ impl ArgMatches {
                 // 10MB.
                 .max_jit_stack_size(Some(10 * (1<<20)));
         }
-        if self.pcre2_unicode() {
+        if self.unicode() {
             builder.utf(true).ucp(true);
             if self.encoding()?.has_explicit_encoding() {
                 // SAFETY: If an encoding was specified, then we're guaranteed
@@ -1602,11 +1602,17 @@ impl ArgMatches {
         self.occurrences_of("unrestricted")
     }
 
-    /// Returns true if and only if PCRE2's Unicode mode should be enabled.
+    /// Returns true if and only if Unicode mode should be enabled.
+    fn unicode(&self) -> bool {
+        // Unicode mode is enabled by default, so only disable it when
+        // --no-unicode is given explicitly.
+        !(self.is_present("no-unicode") || self.is_present("no-pcre2-unicode"))
+    }
+
+    /// Returns true if and only if PCRE2 is enabled and its Unicode mode is
+    /// enabled.
     fn pcre2_unicode(&self) -> bool {
-        // PCRE2 Unicode is enabled by default, so only disable it when told
-        // to do so explicitly.
-        self.is_present("pcre2") && !self.is_present("no-pcre2-unicode")
+        self.is_present("pcre2") && self.unicode()
     }
 
     /// Returns true if and only if file names containing each match should
diff --git a/tests/feature.rs b/tests/feature.rs
index 3d61f459..f3cf8463 100644
--- a/tests/feature.rs
+++ b/tests/feature.rs
@@ -834,3 +834,8 @@ rgtest!(context_sep_empty, |dir: Dir, mut cmd: TestCommand| {
     ]);
     eqnice!("foo\nctx\n\nfoo\nctx\n", cmd.stdout());
 });
+
+rgtest!(no_unicode, |dir: Dir, mut cmd: TestCommand| {
+    dir.create("test", "δ");
+    cmd.arg("-i").arg("--no-unicode").arg("Δ").assert_err();
+});