ripgrep/crates/core/flags/lowargs.rs

/*!
Provides the definition of low level arguments from CLI flags.
*/

use std::{
    ffi::{OsStr, OsString},
    path::PathBuf,
};

use {
    bstr::{BString, ByteVec},
    grep::printer::{HyperlinkFormat, UserColorSpec},
};

/// A collection of "low level" arguments.
///
/// The "low level" here is meant to constrain this type to be as close to the
/// actual CLI flags and arguments as possible. Namely, other than some
/// convenience types to help validate flag values and deal with overrides
/// between flags, these low level arguments do not contain any higher level
/// abstractions.
///
/// Another self-imposed constraint is that populating low level arguments
/// should not require anything other than validating what the user has
/// provided. For example, low level arguments should not contain a
/// `HyperlinkConfig`, since in order to get a full configuration, one needs to
/// discover the hostname of the current system (which might require running a
/// binary or a syscall).
///
/// Low level arguments are populated by the parser directly via the `update`
/// method on the corresponding implementation of the `Flag` trait.
#[derive(Debug, Default)]
pub(crate) struct LowArgs {
    // Essential arguments.
    pub(crate) special: Option<SpecialMode>,
    pub(crate) mode: Mode,
    pub(crate) positional: Vec<OsString>,
    pub(crate) patterns: Vec<PatternSource>,
    // Everything else, sorted lexicographically.
    pub(crate) binary: BinaryMode,
    pub(crate) boundary: Option<BoundaryMode>,
    pub(crate) buffer: BufferMode,
    pub(crate) byte_offset: bool,
    pub(crate) case: CaseMode,
    pub(crate) color: ColorChoice,
    pub(crate) colors: Vec<UserColorSpec>,
    pub(crate) column: Option<bool>,
    pub(crate) context: ContextMode,
    pub(crate) context_separator: ContextSeparator,
    pub(crate) crlf: bool,
    pub(crate) dfa_size_limit: Option<usize>,
    pub(crate) encoding: EncodingMode,
    pub(crate) engine: EngineChoice,
    pub(crate) field_context_separator: FieldContextSeparator,
    pub(crate) field_match_separator: FieldMatchSeparator,
    pub(crate) fixed_strings: bool,
    pub(crate) follow: bool,
    pub(crate) glob_case_insensitive: bool,
    pub(crate) globs: Vec<String>,
    pub(crate) heading: Option<bool>,
    pub(crate) hidden: bool,
    pub(crate) hostname_bin: Option<PathBuf>,
    pub(crate) hyperlink_format: HyperlinkFormat,
    pub(crate) iglobs: Vec<String>,
    pub(crate) ignore_file: Vec<PathBuf>,
    pub(crate) ignore_file_case_insensitive: bool,
    pub(crate) include_zero: bool,
    pub(crate) invert_match: bool,
    pub(crate) line_number: Option<bool>,
    pub(crate) logging: Option<LoggingMode>,
    pub(crate) max_columns: Option<u64>,
    pub(crate) max_columns_preview: bool,
    pub(crate) max_count: Option<u64>,
    pub(crate) max_depth: Option<usize>,
    pub(crate) max_filesize: Option<u64>,
    pub(crate) mmap: MmapMode,
    pub(crate) multiline: bool,
    pub(crate) multiline_dotall: bool,
    pub(crate) no_config: bool,
    pub(crate) no_ignore_dot: bool,
    pub(crate) no_ignore_exclude: bool,
    pub(crate) no_ignore_files: bool,
    pub(crate) no_ignore_global: bool,
    pub(crate) no_ignore_messages: bool,
    pub(crate) no_ignore_parent: bool,
    pub(crate) no_ignore_vcs: bool,
    pub(crate) no_messages: bool,
    pub(crate) no_require_git: bool,
    pub(crate) no_unicode: bool,
    pub(crate) null: bool,
    pub(crate) null_data: bool,
    pub(crate) one_file_system: bool,
    pub(crate) only_matching: bool,
    pub(crate) path_separator: Option<u8>,
    pub(crate) pre: Option<PathBuf>,
    pub(crate) pre_glob: Vec<String>,
    pub(crate) quiet: bool,
    pub(crate) regex_size_limit: Option<usize>,
    pub(crate) replace: Option<BString>,
    pub(crate) search_zip: bool,
    pub(crate) sort: Option<SortMode>,
    pub(crate) stats: bool,
    pub(crate) stop_on_nonmatch: bool,
    pub(crate) threads: Option<usize>,
    pub(crate) trim: bool,
    pub(crate) type_changes: Vec<TypeChange>,
    pub(crate) unrestricted: usize,
    pub(crate) vimgrep: bool,
    pub(crate) with_filename: Option<bool>,
}

/// A "special" mode that supercedes everything else.
///
/// When one of these modes is present, it overrides everything else and causes
/// ripgrep to short-circuit. In particular, we avoid converting low-level
/// argument types into higher level arguments types that can fail for various
/// reasons related to the environment. (Parsing the low-level arguments can
/// fail too, but usually not in a way that can't be worked around by removing
/// the corresponding arguments from the CLI command.) This is overall a hedge
/// to ensure that version and help information are basically always available.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum SpecialMode {
    /// Show a condensed version of "help" output. Generally speaking, this
    /// shows each flag and an extremely terse description of that flag on
    /// a single line. This corresponds to the `-h` flag.
    HelpShort,
    /// Shows a very verbose version of the "help" output. The docs for some
    /// flags will be paragraphs long. This corresponds to the `--help` flag.
    HelpLong,
    /// Show condensed version information. e.g., `ripgrep x.y.z`.
    VersionShort,
    /// Show verbose version information. Includes "short" information as well
    /// as features included in the build.
    VersionLong,
    /// Show PCRE2's version information, or an error if this version of
    /// ripgrep wasn't compiled with PCRE2 support.
    VersionPCRE2,
}

/// The overall mode that ripgrep should operate in.
///
/// If ripgrep were designed without the legacy of grep, these would probably
/// be sub-commands? Perhaps not, since they aren't as frequently used.
///
/// The point of putting these in one enum is that they are all mutually
/// exclusive and override one another.
///
/// Note that -h/--help and -V/--version are not included in this because
/// they always overrides everything else, regardless of where it appears
/// in the command line. They are treated as "special" modes that short-circuit
/// ripgrep's usual flow.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum Mode {
    /// ripgrep will execute a search of some kind.
    Search(SearchMode),
    /// Show the files that *would* be searched, but don't actually search
    /// them.
    Files,
    /// List all file type definitions configured, including the default file
    /// types and any additional file types added to the command line.
    Types,
    /// Generate various things like the man page and completion files.
    Generate(GenerateMode),
}

impl Default for Mode {
    fn default() -> Mode {
        Mode::Search(SearchMode::Standard)
    }
}

impl Mode {
    /// Update this mode to the new mode while implementing various override
    /// semantics. For example, a search mode cannot override a non-search
    /// mode.
    pub(crate) fn update(&mut self, new: Mode) {
        match *self {
            // If we're in a search mode, then anything can override it.
            Mode::Search(_) => *self = new,
            _ => {
                // Once we're in a non-search mode, other non-search modes
                // can override it. But search modes cannot. So for example,
                // `--files -l` will still be Mode::Files.
                if !matches!(*self, Mode::Search(_)) {
                    *self = new;
                }
            }
        }
    }
}

/// The kind of search that ripgrep is going to perform.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum SearchMode {
    /// The default standard mode of operation. ripgrep looks for matches and
    /// prints them when found.
    ///
    /// There is no specific flag for this mode since it's the default. But
    /// some of the modes below, like JSON, have negation flags like --no-json
    /// that let you revert back to this default mode.
    Standard,
    /// Show files containing at least one match.
    FilesWithMatches,
    /// Show files that don't contain any matches.
    FilesWithoutMatch,
    /// Show files containing at least one match and the number of matching
    /// lines.
    Count,
    /// Show files containing at least one match and the total number of
    /// matches.
    CountMatches,
    /// Print matches in a JSON lines format.
    JSON,
}

/// The thing to generate via the --generate flag.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum GenerateMode {
    /// Generate the raw roff used for the man page.
    Man,
    /// Completions for bash.
    CompleteBash,
    /// Completions for zsh.
    CompleteZsh,
    /// Completions for fish.
    CompleteFish,
    /// Completions for PowerShell.
    CompletePowerShell,
}

/// Indicates how ripgrep should treat binary data.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum BinaryMode {
    /// Automatically determine the binary mode to use. Essentially, when
    /// a file is searched explicitly, then it will be searched using the
    /// `SearchAndSuppress` strategy. Otherwise, it will be searched in a way
    /// that attempts to skip binary files as much as possible. That is, once
    /// a file is classified as binary, searching will immediately stop.
    #[default]
    Auto,
    /// Search files even when they have binary data, but if a match is found,
    /// suppress it and emit a warning.
    ///
    /// In this mode, `NUL` bytes are replaced with line terminators. This is
    /// a heuristic meant to reduce heap memory usage, since true binary data
    /// isn't line oriented. If one attempts to treat such data as line
    /// oriented, then one may wind up with impractically large lines. For
    /// example, many binary files contain very long runs of NUL bytes.
    SearchAndSuppress,
    /// Treat all files as if they were plain text. There's no skipping and no
    /// replacement of `NUL` bytes with line terminators.
    AsText,
}

/// Indicates what kind of boundary mode to use (line or word).
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum BoundaryMode {
    /// Only allow matches when surrounded by line bounaries.
    Line,
    /// Only allow matches when surrounded by word bounaries.
    Word,
}

/// Indicates the buffer mode that ripgrep should use when printing output.
///
/// The default is `Auto`.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum BufferMode {
    /// Select the buffer mode, 'line' or 'block', automatically based on
    /// whether stdout is connected to a tty.
    #[default]
    Auto,
    /// Flush the output buffer whenever a line terminator is seen.
    ///
    /// This is useful when wants to see search results more immediately,
    /// for example, with `tail -f`.
    Line,
    /// Flush the output buffer whenever it reaches some fixed size. The size
    /// is usually big enough to hold many lines.
    ///
    /// This is useful for maximum performance, particularly when printing
    /// lots of results.
    Block,
}

/// Indicates the case mode for how to interpret all patterns given to ripgrep.
///
/// The default is `Sensitive`.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum CaseMode {
    /// Patterns are matched case sensitively. i.e., `a` does not match `A`.
    #[default]
    Sensitive,
    /// Patterns are matched case insensitively. i.e., `a` does match `A`.
    Insensitive,
    /// Patterns are automatically matched case insensitively only when they
    /// consist of all lowercase literal characters. For example, the pattern
    /// `a` will match `A` but `A` will not match `a`.
    Smart,
}

/// Indicates whether ripgrep should include color/hyperlinks in its output.
///
/// The default is `Auto`.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum ColorChoice {
    /// Color and hyperlinks will never be used.
    Never,
    /// Color and hyperlinks will be used only when stdout is connected to a
    /// tty.
    #[default]
    Auto,
    /// Color will always be used.
    Always,
    /// Color will always be used and only ANSI escapes will be used.
    ///
    /// This only makes sense in the context of legacy Windows console APIs.
    /// At time of writing, ripgrep will try to use the legacy console APIs
    /// if ANSI coloring isn't believed to be possible. This option will force
    /// ripgrep to use ANSI coloring.
    Ansi,
}

impl ColorChoice {
    /// Convert this color choice to the corresponding termcolor type.
    pub(crate) fn to_termcolor(&self) -> termcolor::ColorChoice {
        match *self {
            ColorChoice::Never => termcolor::ColorChoice::Never,
            ColorChoice::Auto => termcolor::ColorChoice::Auto,
            ColorChoice::Always => termcolor::ColorChoice::Always,
            ColorChoice::Ansi => termcolor::ColorChoice::AlwaysAnsi,
        }
    }
}

/// Indicates the line context options ripgrep should use for output.
///
/// The default is no context at all.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum ContextMode {
    /// All lines will be printed. That is, the context is unbounded.
    Passthru,
    /// Only show a certain number of lines before and after each match.
    Limited(ContextModeLimited),
}

impl Default for ContextMode {
    fn default() -> ContextMode {
        ContextMode::Limited(ContextModeLimited::default())
    }
}

impl ContextMode {
    /// Set the "before" context.
    ///
    /// If this was set to "passthru" context, then it is overridden in favor
    /// of limited context with the given value for "before" and `0` for
    /// "after."
    pub(crate) fn set_before(&mut self, lines: usize) {
        match *self {
            ContextMode::Passthru => {
                *self = ContextMode::Limited(ContextModeLimited {
                    before: Some(lines),
                    after: None,
                    both: None,
                })
            }
            ContextMode::Limited(ContextModeLimited {
                ref mut before,
                ..
            }) => *before = Some(lines),
        }
    }

    /// Set the "after" context.
    ///
    /// If this was set to "passthru" context, then it is overridden in favor
    /// of limited context with the given value for "after" and `0` for
    /// "before."
    pub(crate) fn set_after(&mut self, lines: usize) {
        match *self {
            ContextMode::Passthru => {
                *self = ContextMode::Limited(ContextModeLimited {
                    before: None,
                    after: Some(lines),
                    both: None,
                })
            }
            ContextMode::Limited(ContextModeLimited {
                ref mut after, ..
            }) => *after = Some(lines),
        }
    }

    /// Set the "both" context.
    ///
    /// If this was set to "passthru" context, then it is overridden in favor
    /// of limited context with the given value for "both" and `None` for
    /// "before" and "after".
    pub(crate) fn set_both(&mut self, lines: usize) {
        match *self {
            ContextMode::Passthru => {
                *self = ContextMode::Limited(ContextModeLimited {
                    before: None,
                    after: None,
                    both: Some(lines),
                })
            }
            ContextMode::Limited(ContextModeLimited {
                ref mut both, ..
            }) => *both = Some(lines),
        }
    }

    /// A convenience function for use in tests that returns the limited
    /// context. If this mode isn't limited, then it panics.
    #[cfg(test)]
    pub(crate) fn get_limited(&self) -> (usize, usize) {
        match *self {
            ContextMode::Passthru => unreachable!("context mode is passthru"),
            ContextMode::Limited(ref limited) => limited.get(),
        }
    }
}

/// A context mode for a finite number of lines.
///
/// Namely, this indicates that a specific number of lines (possibly zero)
/// should be shown before and/or after each matching line.
///
/// Note that there is a subtle difference between `Some(0)` and `None`. In the
/// former case, it happens when `0` is given explicitly, where as `None` is
/// the default value and occurs when no value is specified.
///
/// `both` is only set by the -C/--context flag. The reason why we don't just
/// set before = after = --context is because the before and after context
/// settings always take precedent over the -C/--context setting, regardless of
/// order. Thus, we need to keep track of them separately.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) struct ContextModeLimited {
    before: Option<usize>,
    after: Option<usize>,
    both: Option<usize>,
}

impl ContextModeLimited {
    /// Returns the specific number of contextual lines that should be shown
    /// around each match. This takes proper precedent into account, i.e.,
    /// that `before` and `after` both partially override `both` in all cases.
    ///
    /// By default, this returns `(0, 0)`.
    pub(crate) fn get(&self) -> (usize, usize) {
        let (mut before, mut after) =
            self.both.map(|lines| (lines, lines)).unwrap_or((0, 0));
        // --before and --after always override --context, regardless
        // of where they appear relative to each other.
        if let Some(lines) = self.before {
            before = lines;
        }
        if let Some(lines) = self.after {
            after = lines;
        }
        (before, after)
    }
}

/// Represents the separator to use between non-contiguous sections of
/// contextual lines.
///
/// The default is `--`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct ContextSeparator(Option<BString>);

impl Default for ContextSeparator {
    fn default() -> ContextSeparator {
        ContextSeparator(Some(BString::from("--")))
    }
}

impl ContextSeparator {
    /// Create a new context separator from the user provided argument. This
    /// handles unescaping.
    pub(crate) fn new(os: &OsStr) -> anyhow::Result<ContextSeparator> {
        let Some(string) = os.to_str() else {
            anyhow::bail!(
                "separator must be valid UTF-8 (use escape sequences \
                 to provide a separator that is not valid UTF-8)"
            )
        };
        Ok(ContextSeparator(Some(Vec::unescape_bytes(string).into())))
    }

    /// Creates a new separator that intructs the printer to disable contextual
    /// separators entirely.
    pub(crate) fn disabled() -> ContextSeparator {
        ContextSeparator(None)
    }

    /// Return the raw bytes of this separator.
    ///
    /// If context separators were disabled, then this returns `None`.
    ///
    /// Note that this may return a `Some` variant with zero bytes.
    pub(crate) fn into_bytes(self) -> Option<Vec<u8>> {
        self.0.map(|sep| sep.into())
    }
}

/// The encoding mode the searcher will use.
///
/// The default is `Auto`.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum EncodingMode {
    /// Use only BOM sniffing to auto-detect an encoding.
    #[default]
    Auto,
    /// Use an explicit encoding forcefully, but let BOM sniffing override it.
    Some(grep::searcher::Encoding),
    /// Use no explicit encoding and disable all BOM sniffing. This will
    /// always result in searching the raw bytes, regardless of their
    /// true encoding.
    Disabled,
}

/// The regex engine to use.
///
/// The default is `Default`.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum EngineChoice {
    /// Uses the default regex engine: Rust's `regex` crate.
    ///
    /// (Well, technically it uses `regex-automata`, but `regex-automata` is
    /// the implementation of the `regex` crate.)
    #[default]
    Default,
    /// Dynamically select the right engine to use.
    ///
    /// This works by trying to use the default engine, and if the pattern does
    /// not compile, it switches over to the PCRE2 engine if it's available.
    Auto,
    /// Uses the PCRE2 regex engine if it's available.
    PCRE2,
}

/// The field context separator to use to between metadata for each contextual
/// line.
///
/// The default is `-`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct FieldContextSeparator(BString);

impl Default for FieldContextSeparator {
    fn default() -> FieldContextSeparator {
        FieldContextSeparator(BString::from("-"))
    }
}

impl FieldContextSeparator {
    /// Create a new separator from the given argument value provided by the
    /// user. Unescaping it automatically handled.
    pub(crate) fn new(os: &OsStr) -> anyhow::Result<FieldContextSeparator> {
        let Some(string) = os.to_str() else {
            anyhow::bail!(
                "separator must be valid UTF-8 (use escape sequences \
                 to provide a separator that is not valid UTF-8)"
            )
        };
        Ok(FieldContextSeparator(Vec::unescape_bytes(string).into()))
    }

    /// Return the raw bytes of this separator.
    ///
    /// Note that this may return an empty `Vec`.
    pub(crate) fn into_bytes(self) -> Vec<u8> {
        self.0.into()
    }
}

/// The field match separator to use to between metadata for each matching
/// line.
///
/// The default is `:`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct FieldMatchSeparator(BString);

impl Default for FieldMatchSeparator {
    fn default() -> FieldMatchSeparator {
        FieldMatchSeparator(BString::from(":"))
    }
}

impl FieldMatchSeparator {
    /// Create a new separator from the given argument value provided by the
    /// user. Unescaping it automatically handled.
    pub(crate) fn new(os: &OsStr) -> anyhow::Result<FieldMatchSeparator> {
        let Some(string) = os.to_str() else {
            anyhow::bail!(
                "separator must be valid UTF-8 (use escape sequences \
                 to provide a separator that is not valid UTF-8)"
            )
        };
        Ok(FieldMatchSeparator(Vec::unescape_bytes(string).into()))
    }

    /// Return the raw bytes of this separator.
    ///
    /// Note that this may return an empty `Vec`.
    pub(crate) fn into_bytes(self) -> Vec<u8> {
        self.0.into()
    }
}

/// The type of logging to do. `Debug` emits some details while `Trace` emits
/// much more.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum LoggingMode {
    Debug,
    Trace,
}

/// Indicates when to use memory maps.
///
/// The default is `Auto`.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum MmapMode {
    /// This instructs ripgrep to use heuristics for selecting when to and not
    /// to use memory maps for searching.
    #[default]
    Auto,
    /// This instructs ripgrep to always try memory maps when possible. (Memory
    /// maps are not possible to use in all circumstances, for example, for
    /// virtual files.)
    AlwaysTryMmap,
    /// Never use memory maps under any circumstances. This includes even
    /// when multi-line search is enabled where ripgrep will read the entire
    /// contents of a file on to the heap before searching it.
    Never,
}

/// Represents a source of patterns that ripgrep should search for.
///
/// The reason to unify these is so that we can retain the order of `-f/--flag`
/// and `-e/--regexp` flags relative to one another.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum PatternSource {
    /// Comes from the `-e/--regexp` flag.
    Regexp(String),
    /// Comes from the `-f/--file` flag.
    File(PathBuf),
}

/// The sort criteria, if present.
#[derive(Debug, Eq, PartialEq)]
pub(crate) struct SortMode {
    /// Whether to reverse the sort criteria (i.e., descending order).
    pub(crate) reverse: bool,
    /// The actual sorting criteria.
    pub(crate) kind: SortModeKind,
}

/// The criteria to use for sorting.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum SortModeKind {
    /// Sort by path.
    Path,
    /// Sort by last modified time.
    LastModified,
    /// Sort by last accessed time.
    LastAccessed,
    /// Sort by creation time.
    Created,
}

impl SortMode {
    /// Checks whether the selected sort mode is supported. If it isn't, an
    /// error (hopefully explaining why) is returned.
    pub(crate) fn supported(&self) -> anyhow::Result<()> {
        match self.kind {
            SortModeKind::Path => Ok(()),
            SortModeKind::LastModified => {
                let md = std::env::current_exe()
                    .and_then(|p| p.metadata())
                    .and_then(|md| md.modified());
                let Err(err) = md else { return Ok(()) };
                anyhow::bail!(
                    "sorting by last modified isn't supported: {err}"
                );
            }
            SortModeKind::LastAccessed => {
                let md = std::env::current_exe()
                    .and_then(|p| p.metadata())
                    .and_then(|md| md.accessed());
                let Err(err) = md else { return Ok(()) };
                anyhow::bail!(
                    "sorting by last accessed isn't supported: {err}"
                );
            }
            SortModeKind::Created => {
                let md = std::env::current_exe()
                    .and_then(|p| p.metadata())
                    .and_then(|md| md.created());
                let Err(err) = md else { return Ok(()) };
                anyhow::bail!(
                    "sorting by creation time isn't supported: {err}"
                );
            }
        }
    }
}

/// A single instance of either a change or a selection of one ripgrep's
/// file types.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum TypeChange {
    /// Clear the given type from ripgrep.
    Clear { name: String },
    /// Add the given type definition (name and glob) to ripgrep.
    Add { def: String },
    /// Select the given type for filtering.
    Select { name: String },
    /// Select the given type for filtering but negate it.
    Negate { name: String },
}