ripgrep/crates/core/flags/parse.rs

/*!
Parses command line arguments into a structured and typed representation.
*/

use std::{borrow::Cow, collections::BTreeSet, ffi::OsString};

use anyhow::Context;

use crate::flags::{
    defs::FLAGS,
    hiargs::HiArgs,
    lowargs::{LoggingMode, LowArgs, SpecialMode},
    Flag, FlagValue,
};

/// The result of parsing CLI arguments.
///
/// This is basically a `anyhow::Result<T>`, but with one extra variant that is
/// inhabited whenever ripgrep should execute a "special" mode. That is, when a
/// user provides the `-h/--help` or `-V/--version` flags.
///
/// This special variant exists to allow CLI parsing to short circuit as
/// quickly as is reasonable. For example, it lets CLI parsing avoid reading
/// ripgrep's configuration and converting low level arguments into a higher
/// level representation.
#[derive(Debug)]
pub(crate) enum ParseResult<T> {
    Special(SpecialMode),
    Ok(T),
    Err(anyhow::Error),
}

impl<T> ParseResult<T> {
    /// If this result is `Ok`, then apply `then` to it. Otherwise, return this
    /// result unchanged.
    fn and_then<U>(
        self,
        mut then: impl FnMut(T) -> ParseResult<U>,
    ) -> ParseResult<U> {
        match self {
            ParseResult::Special(mode) => ParseResult::Special(mode),
            ParseResult::Ok(t) => then(t),
            ParseResult::Err(err) => ParseResult::Err(err),
        }
    }
}

/// Parse CLI arguments and convert then to their high level representation.
pub(crate) fn parse() -> ParseResult<HiArgs> {
    parse_low().and_then(|low| match HiArgs::from_low_args(low) {
        Ok(hi) => ParseResult::Ok(hi),
        Err(err) => ParseResult::Err(err),
    })
}

/// Parse CLI arguments only into their low level representation.
///
/// This takes configuration into account. That is, it will try to read
/// `RIPGREP_CONFIG_PATH` and prepend any arguments found there to the
/// arguments passed to this process.
///
/// This will also set one-time global state flags, such as the log level and
/// whether messages should be printed.
fn parse_low() -> ParseResult<LowArgs> {
    if let Err(err) = crate::logger::Logger::init() {
        let err = anyhow::anyhow!("failed to initialize logger: {err}");
        return ParseResult::Err(err);
    }

    let parser = Parser::new();
    let mut low = LowArgs::default();
    if let Err(err) = parser.parse(std::env::args_os().skip(1), &mut low) {
        return ParseResult::Err(err);
    }
    // Even though we haven't parsed the config file yet (assuming it exists),
    // we can still use the arguments given on the CLI to setup ripgrep's
    // logging preferences. Even if the config file changes them in some way,
    // it's really the best we can do. This way, for example, folks can pass
    // `--trace` and see any messages logged during config file parsing.
    set_log_levels(&low);
    // Before we try to take configuration into account, we can bail early
    // if a special mode was enabled. This is basically only for version and
    // help output which shouldn't be impacted by extra configuration.
    if let Some(special) = low.special.take() {
        return ParseResult::Special(special);
    }
    // If the end user says no config, then respect it.
    if low.no_config {
        log::debug!("not reading config files because --no-config is present");
        return ParseResult::Ok(low);
    }
    // Look for arguments from a config file. If we got nothing (whether the
    // file is empty or RIPGREP_CONFIG_PATH wasn't set), then we don't need
    // to re-parse.
    let config_args = crate::flags::config::args();
    if config_args.is_empty() {
        log::debug!("no extra arguments found from configuration file");
        return ParseResult::Ok(low);
    }
    // The final arguments are just the arguments from the CLI appending to
    // the end of the config arguments.
    let mut final_args = config_args;
    final_args.extend(std::env::args_os().skip(1));

    // Now do the CLI parsing dance again.
    let mut low = LowArgs::default();
    if let Err(err) = parser.parse(final_args.into_iter(), &mut low) {
        return ParseResult::Err(err);
    }
    // Reset the message and logging levels, since they could have changed.
    set_log_levels(&low);
    ParseResult::Ok(low)
}

/// Sets global state flags that control logging based on low-level arguments.
fn set_log_levels(low: &LowArgs) {
    crate::messages::set_messages(!low.no_messages);
    crate::messages::set_ignore_messages(!low.no_ignore_messages);
    match low.logging {
        Some(LoggingMode::Trace) => {
            log::set_max_level(log::LevelFilter::Trace)
        }
        Some(LoggingMode::Debug) => {
            log::set_max_level(log::LevelFilter::Debug)
        }
        None => log::set_max_level(log::LevelFilter::Warn),
    }
}

/// Parse the sequence of CLI arguments given a low level typed set of
/// arguments.
///
/// This is exposed for testing that the correct low-level arguments are parsed
/// from a CLI. It just runs the parser once over the CLI arguments. It doesn't
/// setup logging or read from a config file.
///
/// This assumes the iterator given does *not* begin with the binary name.
#[cfg(test)]
pub(crate) fn parse_low_raw(
    rawargs: impl IntoIterator<Item = impl Into<OsString>>,
) -> anyhow::Result<LowArgs> {
    let mut args = LowArgs::default();
    Parser::new().parse(rawargs, &mut args)?;
    Ok(args)
}

/// Return the metadata for the flag of the given name.
pub(super) fn lookup(name: &str) -> Option<&'static dyn Flag> {
    // N.B. Creating a new parser might look expensive, but it only builds
    // the lookup trie exactly once. That is, we get a `&'static Parser` from
    // `Parser::new()`.
    match Parser::new().find_long(name) {
        FlagLookup::Match(&FlagInfo { flag, .. }) => Some(flag),
        _ => None,
    }
}

/// A parser for turning a sequence of command line arguments into a more
/// strictly typed set of arguments.
#[derive(Debug)]
struct Parser {
    /// A single map that contains all possible flag names. This includes
    /// short and long names, aliases and negations. This maps those names to
    /// indices into `info`.
    map: FlagMap,
    /// A map from IDs returned by the `map` to the corresponding flag
    /// information.
    info: Vec<FlagInfo>,
}

impl Parser {
    /// Create a new parser.
    ///
    /// This always creates the same parser and only does it once. Callers may
    /// call this repeatedly, and the parser will only be built once.
    fn new() -> &'static Parser {
        use std::sync::OnceLock;

        // Since a parser's state is immutable and completely determined by
        // FLAGS, and since FLAGS is a constant, we can initialize it exactly
        // once.
        static P: OnceLock<Parser> = OnceLock::new();
        P.get_or_init(|| {
            let mut infos = vec![];
            for &flag in FLAGS.iter() {
                infos.push(FlagInfo {
                    flag,
                    name: Ok(flag.name_long()),
                    kind: FlagInfoKind::Standard,
                });
                for alias in flag.aliases() {
                    infos.push(FlagInfo {
                        flag,
                        name: Ok(alias),
                        kind: FlagInfoKind::Alias,
                    });
                }
                if let Some(byte) = flag.name_short() {
                    infos.push(FlagInfo {
                        flag,
                        name: Err(byte),
                        kind: FlagInfoKind::Standard,
                    });
                }
                if let Some(name) = flag.name_negated() {
                    infos.push(FlagInfo {
                        flag,
                        name: Ok(name),
                        kind: FlagInfoKind::Negated,
                    });
                }
            }
            let map = FlagMap::new(&infos);
            Parser { map, info: infos }
        })
    }

    /// Parse the given CLI arguments into a low level representation.
    ///
    /// The iterator given should *not* start with the binary name.
    fn parse<I, O>(&self, rawargs: I, args: &mut LowArgs) -> anyhow::Result<()>
    where
        I: IntoIterator<Item = O>,
        O: Into<OsString>,
    {
        let mut p = lexopt::Parser::from_args(rawargs);
        while let Some(arg) = p.next().context("invalid CLI arguments")? {
            let lookup = match arg {
                lexopt::Arg::Value(value) => {
                    args.positional.push(value);
                    continue;
                }
                lexopt::Arg::Short(ch) if ch == 'h' => {
                    // Special case -h/--help since behavior is different
                    // based on whether short or long flag is given.
                    args.special = Some(SpecialMode::HelpShort);
                    continue;
                }
                lexopt::Arg::Short(ch) if ch == 'V' => {
                    // Special case -V/--version since behavior is different
                    // based on whether short or long flag is given.
                    args.special = Some(SpecialMode::VersionShort);
                    continue;
                }
                lexopt::Arg::Short(ch) => self.find_short(ch),
                lexopt::Arg::Long(name) if name == "help" => {
                    // Special case -h/--help since behavior is different
                    // based on whether short or long flag is given.
                    args.special = Some(SpecialMode::HelpLong);
                    continue;
                }
                lexopt::Arg::Long(name) if name == "version" => {
                    // Special case -V/--version since behavior is different
                    // based on whether short or long flag is given.
                    args.special = Some(SpecialMode::VersionLong);
                    continue;
                }
                lexopt::Arg::Long(name) => self.find_long(name),
            };
            let mat = match lookup {
                FlagLookup::Match(mat) => mat,
                FlagLookup::UnrecognizedShort(name) => {
                    anyhow::bail!("unrecognized flag -{name}")
                }
                FlagLookup::UnrecognizedLong(name) => {
                    let mut msg = format!("unrecognized flag --{name}");
                    if let Some(suggest_msg) = suggest(&name) {
                        msg = format!("{msg}\n\n{suggest_msg}");
                    }
                    anyhow::bail!("{msg}")
                }
            };
            let value = if matches!(mat.kind, FlagInfoKind::Negated) {
                // Negated flags are always switches, even if the non-negated
                // flag is not. For example, --context-separator accepts a
                // value, but --no-context-separator does not.
                FlagValue::Switch(false)
            } else if mat.flag.is_switch() {
                FlagValue::Switch(true)
            } else {
                FlagValue::Value(p.value().with_context(|| {
                    format!("missing value for flag {mat}")
                })?)
            };
            mat.flag
                .update(value, args)
                .with_context(|| format!("error parsing flag {mat}"))?;
        }
        Ok(())
    }

    /// Look for a flag by its short name.
    fn find_short(&self, ch: char) -> FlagLookup<'_> {
        if !ch.is_ascii() {
            return FlagLookup::UnrecognizedShort(ch);
        }
        let byte = u8::try_from(ch).unwrap();
        let Some(index) = self.map.find(&[byte]) else {
            return FlagLookup::UnrecognizedShort(ch);
        };
        FlagLookup::Match(&self.info[index])
    }

    /// Look for a flag by its long name.
    ///
    /// This also works for aliases and negated names.
    fn find_long(&self, name: &str) -> FlagLookup<'_> {
        let Some(index) = self.map.find(name.as_bytes()) else {
            return FlagLookup::UnrecognizedLong(name.to_string());
        };
        FlagLookup::Match(&self.info[index])
    }
}

/// The result of looking up a flag name.
#[derive(Debug)]
enum FlagLookup<'a> {
    /// Lookup found a match and the metadata for the flag is attached.
    Match(&'a FlagInfo),
    /// The given short name is unrecognized.
    UnrecognizedShort(char),
    /// The given long name is unrecognized.
    UnrecognizedLong(String),
}

/// The info about a flag associated with a flag's ID in the flag map.
#[derive(Debug)]
struct FlagInfo {
    /// The flag object and its associated metadata.
    flag: &'static dyn Flag,
    /// The actual name that is stored in the Aho-Corasick automaton. When this
    /// is a byte, it corresponds to a short single character ASCII flag. The
    /// actual pattern that's in the Aho-Corasick automaton is just the single
    /// byte.
    name: Result<&'static str, u8>,
    /// The type of flag that is stored for the corresponding Aho-Corasick
    /// pattern.
    kind: FlagInfoKind,
}

/// The kind of flag that is being matched.
#[derive(Debug)]
enum FlagInfoKind {
    /// A standard flag, e.g., --passthru.
    Standard,
    /// A negation of a standard flag, e.g., --no-multiline.
    Negated,
    /// An alias for a standard flag, e.g., --passthrough.
    Alias,
}

impl std::fmt::Display for FlagInfo {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self.name {
            Ok(long) => write!(f, "--{long}"),
            Err(short) => write!(f, "-{short}", short = char::from(short)),
        }
    }
}

/// A map from flag names (short, long, negated and aliases) to their ID.
///
/// Once an ID is known, it can be used to look up a flag's metadata in the
/// parser's internal state.
#[derive(Debug)]
struct FlagMap {
    map: std::collections::HashMap<Vec<u8>, usize>,
}

impl FlagMap {
    /// Create a new map of flags for the given flag information.
    ///
    /// The index of each flag info corresponds to its ID.
    fn new(infos: &[FlagInfo]) -> FlagMap {
        let mut map = std::collections::HashMap::with_capacity(infos.len());
        for (i, info) in infos.iter().enumerate() {
            match info.name {
                Ok(name) => {
                    assert_eq!(None, map.insert(name.as_bytes().to_vec(), i));
                }
                Err(byte) => {
                    assert_eq!(None, map.insert(vec![byte], i));
                }
            }
        }
        FlagMap { map }
    }

    /// Look for a match of `name` in the given Aho-Corasick automaton.
    ///
    /// This only returns a match if the one found has a length equivalent to
    /// the length of the name given.
    fn find(&self, name: &[u8]) -> Option<usize> {
        self.map.get(name).copied()
    }
}

/// Possibly return a message suggesting flags similar in the name to the one
/// given.
///
/// The one given should be a flag given by the user (without the leading
/// dashes) that was unrecognized. This attempts to find existing flags that
/// are similar to the one given.
fn suggest(unrecognized: &str) -> Option<String> {
    let similars = find_similar_names(unrecognized);
    if similars.is_empty() {
        return None;
    }
    let list = similars
        .into_iter()
        .map(|name| format!("--{name}"))
        .collect::<Vec<String>>()
        .join(", ");
    Some(format!("similar flags that are available: {list}"))
}

/// Return a sequence of names similar to the unrecognized name given.
fn find_similar_names(unrecognized: &str) -> Vec<&'static str> {
    // The jaccard similarity threshold at which we consider two flag names
    // similar enough that it's worth suggesting it to the end user.
    //
    // This value was determined by some ad hoc experimentation. It might need
    // further tweaking.
    const THRESHOLD: f64 = 0.4;

    let mut similar = vec![];
    let bow_given = ngrams(unrecognized);
    for &flag in FLAGS.iter() {
        let name = flag.name_long();
        let bow = ngrams(name);
        if jaccard_index(&bow_given, &bow) >= THRESHOLD {
            similar.push(name);
        }
        if let Some(name) = flag.name_negated() {
            let bow = ngrams(name);
            if jaccard_index(&bow_given, &bow) >= THRESHOLD {
                similar.push(name);
            }
        }
        for name in flag.aliases() {
            let bow = ngrams(name);
            if jaccard_index(&bow_given, &bow) >= THRESHOLD {
                similar.push(name);
            }
        }
    }
    similar
}

/// A "bag of words" is a set of ngrams.
type BagOfWords<'a> = BTreeSet<Cow<'a, [u8]>>;

/// Returns the jaccard index (a measure of similarity) between sets of ngrams.
fn jaccard_index(ngrams1: &BagOfWords<'_>, ngrams2: &BagOfWords<'_>) -> f64 {
    let union = u32::try_from(ngrams1.union(ngrams2).count())
        .expect("fewer than u32::MAX flags");
    let intersection = u32::try_from(ngrams1.intersection(ngrams2).count())
        .expect("fewer than u32::MAX flags");
    f64::from(intersection) / f64::from(union)
}

/// Returns all 3-grams in the slice given.
///
/// If the slice doesn't contain a 3-gram, then one is artificially created by
/// padding it out with a character that will never appear in a flag name.
fn ngrams(flag_name: &str) -> BagOfWords<'_> {
    // We only allow ASCII flag names, so we can just use bytes.
    let slice = flag_name.as_bytes();
    let seq: Vec<Cow<[u8]>> = match slice.len() {
        0 => vec![Cow::Owned(b"!!!".to_vec())],
        1 => vec![Cow::Owned(vec![slice[0], b'!', b'!'])],
        2 => vec![Cow::Owned(vec![slice[0], slice[1], b'!'])],
        _ => slice.windows(3).map(Cow::Borrowed).collect(),
    };
    BTreeSet::from_iter(seq)
}