mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-03-03 14:32:22 +02:00
477 lines
17 KiB
Rust
477 lines
17 KiB
Rust
/*!
Parses command line arguments into a structured and typed representation.
*/
|
|
|
|
use std::{borrow::Cow, collections::BTreeSet, ffi::OsString};
|
|
|
|
use anyhow::Context;
|
|
|
|
use crate::flags::{
|
|
defs::FLAGS,
|
|
hiargs::HiArgs,
|
|
lowargs::{LoggingMode, LowArgs, SpecialMode},
|
|
Flag, FlagValue,
|
|
};
|
|
|
|
/// The result of parsing CLI arguments.
|
|
///
|
|
/// This is basically a `anyhow::Result<T>`, but with one extra variant that is
|
|
/// inhabited whenever ripgrep should execute a "special" mode. That is, when a
|
|
/// user provides the `-h/--help` or `-V/--version` flags.
|
|
///
|
|
/// This special variant exists to allow CLI parsing to short circuit as
|
|
/// quickly as is reasonable. For example, it lets CLI parsing avoid reading
|
|
/// ripgrep's configuration and converting low level arguments into a higher
|
|
/// level representation.
|
|
#[derive(Debug)]
|
|
pub(crate) enum ParseResult<T> {
|
|
Special(SpecialMode),
|
|
Ok(T),
|
|
Err(anyhow::Error),
|
|
}
|
|
|
|
impl<T> ParseResult<T> {
|
|
/// If this result is `Ok`, then apply `then` to it. Otherwise, return this
|
|
/// result unchanged.
|
|
fn and_then<U>(
|
|
self,
|
|
mut then: impl FnMut(T) -> ParseResult<U>,
|
|
) -> ParseResult<U> {
|
|
match self {
|
|
ParseResult::Special(mode) => ParseResult::Special(mode),
|
|
ParseResult::Ok(t) => then(t),
|
|
ParseResult::Err(err) => ParseResult::Err(err),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse CLI arguments and convert then to their high level representation.
|
|
pub(crate) fn parse() -> ParseResult<HiArgs> {
|
|
parse_low().and_then(|low| match HiArgs::from_low_args(low) {
|
|
Ok(hi) => ParseResult::Ok(hi),
|
|
Err(err) => ParseResult::Err(err),
|
|
})
|
|
}
|
|
|
|
/// Parse CLI arguments only into their low level representation.
|
|
///
|
|
/// This takes configuration into account. That is, it will try to read
|
|
/// `RIPGREP_CONFIG_PATH` and prepend any arguments found there to the
|
|
/// arguments passed to this process.
|
|
///
|
|
/// This will also set one-time global state flags, such as the log level and
|
|
/// whether messages should be printed.
|
|
fn parse_low() -> ParseResult<LowArgs> {
|
|
if let Err(err) = crate::logger::Logger::init() {
|
|
let err = anyhow::anyhow!("failed to initialize logger: {err}");
|
|
return ParseResult::Err(err);
|
|
}
|
|
|
|
let parser = Parser::new();
|
|
let mut low = LowArgs::default();
|
|
if let Err(err) = parser.parse(std::env::args_os().skip(1), &mut low) {
|
|
return ParseResult::Err(err);
|
|
}
|
|
// Even though we haven't parsed the config file yet (assuming it exists),
|
|
// we can still use the arguments given on the CLI to setup ripgrep's
|
|
// logging preferences. Even if the config file changes them in some way,
|
|
// it's really the best we can do. This way, for example, folks can pass
|
|
// `--trace` and see any messages logged during config file parsing.
|
|
set_log_levels(&low);
|
|
// Before we try to take configuration into account, we can bail early
|
|
// if a special mode was enabled. This is basically only for version and
|
|
// help output which shouldn't be impacted by extra configuration.
|
|
if let Some(special) = low.special.take() {
|
|
return ParseResult::Special(special);
|
|
}
|
|
// If the end user says no config, then respect it.
|
|
if low.no_config {
|
|
log::debug!("not reading config files because --no-config is present");
|
|
return ParseResult::Ok(low);
|
|
}
|
|
// Look for arguments from a config file. If we got nothing (whether the
|
|
// file is empty or RIPGREP_CONFIG_PATH wasn't set), then we don't need
|
|
// to re-parse.
|
|
let config_args = crate::flags::config::args();
|
|
if config_args.is_empty() {
|
|
log::debug!("no extra arguments found from configuration file");
|
|
return ParseResult::Ok(low);
|
|
}
|
|
// The final arguments are just the arguments from the CLI appending to
|
|
// the end of the config arguments.
|
|
let mut final_args = config_args;
|
|
final_args.extend(std::env::args_os().skip(1));
|
|
|
|
// Now do the CLI parsing dance again.
|
|
let mut low = LowArgs::default();
|
|
if let Err(err) = parser.parse(final_args.into_iter(), &mut low) {
|
|
return ParseResult::Err(err);
|
|
}
|
|
// Reset the message and logging levels, since they could have changed.
|
|
set_log_levels(&low);
|
|
ParseResult::Ok(low)
|
|
}
|
|
|
|
/// Sets global state flags that control logging based on low-level arguments.
|
|
fn set_log_levels(low: &LowArgs) {
|
|
crate::messages::set_messages(!low.no_messages);
|
|
crate::messages::set_ignore_messages(!low.no_ignore_messages);
|
|
match low.logging {
|
|
Some(LoggingMode::Trace) => {
|
|
log::set_max_level(log::LevelFilter::Trace)
|
|
}
|
|
Some(LoggingMode::Debug) => {
|
|
log::set_max_level(log::LevelFilter::Debug)
|
|
}
|
|
None => log::set_max_level(log::LevelFilter::Warn),
|
|
}
|
|
}
|
|
|
|
/// Parse the given sequence of CLI arguments into a low level typed set of
/// arguments.
///
/// This exists so tests can check that a CLI produces the correct low-level
/// arguments. It runs the parser exactly once over the arguments given. It
/// does not set up logging and does not read a config file.
///
/// The iterator given must *not* begin with the binary name.
#[cfg(test)]
pub(crate) fn parse_low_raw(
    rawargs: impl IntoIterator<Item = impl Into<OsString>>,
) -> anyhow::Result<LowArgs> {
    let mut low = LowArgs::default();
    Parser::new().parse(rawargs, &mut low).map(|()| low)
}
|
|
|
|
/// Return the metadata for the flag of the given name.
|
|
pub(super) fn lookup(name: &str) -> Option<&'static dyn Flag> {
|
|
// N.B. Creating a new parser might look expensive, but it only builds
|
|
// the lookup trie exactly once. That is, we get a `&'static Parser` from
|
|
// `Parser::new()`.
|
|
match Parser::new().find_long(name) {
|
|
FlagLookup::Match(&FlagInfo { flag, .. }) => Some(flag),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
/// A parser for turning a sequence of command line arguments into a more
|
|
/// strictly typed set of arguments.
|
|
#[derive(Debug)]
|
|
struct Parser {
|
|
/// A single map that contains all possible flag names. This includes
|
|
/// short and long names, aliases and negations. This maps those names to
|
|
/// indices into `info`.
|
|
map: FlagMap,
|
|
/// A map from IDs returned by the `map` to the corresponding flag
|
|
/// information.
|
|
info: Vec<FlagInfo>,
|
|
}
|
|
|
|
impl Parser {
|
|
/// Create a new parser.
|
|
///
|
|
/// This always creates the same parser and only does it once. Callers may
|
|
/// call this repeatedly, and the parser will only be built once.
|
|
fn new() -> &'static Parser {
|
|
use std::sync::OnceLock;
|
|
|
|
// Since a parser's state is immutable and completely determined by
|
|
// FLAGS, and since FLAGS is a constant, we can initialize it exactly
|
|
// once.
|
|
static P: OnceLock<Parser> = OnceLock::new();
|
|
P.get_or_init(|| {
|
|
let mut infos = vec![];
|
|
for &flag in FLAGS.iter() {
|
|
infos.push(FlagInfo {
|
|
flag,
|
|
name: Ok(flag.name_long()),
|
|
kind: FlagInfoKind::Standard,
|
|
});
|
|
for alias in flag.aliases() {
|
|
infos.push(FlagInfo {
|
|
flag,
|
|
name: Ok(alias),
|
|
kind: FlagInfoKind::Alias,
|
|
});
|
|
}
|
|
if let Some(byte) = flag.name_short() {
|
|
infos.push(FlagInfo {
|
|
flag,
|
|
name: Err(byte),
|
|
kind: FlagInfoKind::Standard,
|
|
});
|
|
}
|
|
if let Some(name) = flag.name_negated() {
|
|
infos.push(FlagInfo {
|
|
flag,
|
|
name: Ok(name),
|
|
kind: FlagInfoKind::Negated,
|
|
});
|
|
}
|
|
}
|
|
let map = FlagMap::new(&infos);
|
|
Parser { map, info: infos }
|
|
})
|
|
}
|
|
|
|
/// Parse the given CLI arguments into a low level representation.
|
|
///
|
|
/// The iterator given should *not* start with the binary name.
|
|
fn parse<I, O>(&self, rawargs: I, args: &mut LowArgs) -> anyhow::Result<()>
|
|
where
|
|
I: IntoIterator<Item = O>,
|
|
O: Into<OsString>,
|
|
{
|
|
let mut p = lexopt::Parser::from_args(rawargs);
|
|
while let Some(arg) = p.next().context("invalid CLI arguments")? {
|
|
let lookup = match arg {
|
|
lexopt::Arg::Value(value) => {
|
|
args.positional.push(value);
|
|
continue;
|
|
}
|
|
lexopt::Arg::Short(ch) if ch == 'h' => {
|
|
// Special case -h/--help since behavior is different
|
|
// based on whether short or long flag is given.
|
|
args.special = Some(SpecialMode::HelpShort);
|
|
continue;
|
|
}
|
|
lexopt::Arg::Short(ch) if ch == 'V' => {
|
|
// Special case -V/--version since behavior is different
|
|
// based on whether short or long flag is given.
|
|
args.special = Some(SpecialMode::VersionShort);
|
|
continue;
|
|
}
|
|
lexopt::Arg::Short(ch) => self.find_short(ch),
|
|
lexopt::Arg::Long(name) if name == "help" => {
|
|
// Special case -h/--help since behavior is different
|
|
// based on whether short or long flag is given.
|
|
args.special = Some(SpecialMode::HelpLong);
|
|
continue;
|
|
}
|
|
lexopt::Arg::Long(name) if name == "version" => {
|
|
// Special case -V/--version since behavior is different
|
|
// based on whether short or long flag is given.
|
|
args.special = Some(SpecialMode::VersionLong);
|
|
continue;
|
|
}
|
|
lexopt::Arg::Long(name) => self.find_long(name),
|
|
};
|
|
let mat = match lookup {
|
|
FlagLookup::Match(mat) => mat,
|
|
FlagLookup::UnrecognizedShort(name) => {
|
|
anyhow::bail!("unrecognized flag -{name}")
|
|
}
|
|
FlagLookup::UnrecognizedLong(name) => {
|
|
let mut msg = format!("unrecognized flag --{name}");
|
|
if let Some(suggest_msg) = suggest(&name) {
|
|
msg = format!("{msg}\n\n{suggest_msg}");
|
|
}
|
|
anyhow::bail!("{msg}")
|
|
}
|
|
};
|
|
let value = if matches!(mat.kind, FlagInfoKind::Negated) {
|
|
// Negated flags are always switches, even if the non-negated
|
|
// flag is not. For example, --context-separator accepts a
|
|
// value, but --no-context-separator does not.
|
|
FlagValue::Switch(false)
|
|
} else if mat.flag.is_switch() {
|
|
FlagValue::Switch(true)
|
|
} else {
|
|
FlagValue::Value(p.value().with_context(|| {
|
|
format!("missing value for flag {mat}")
|
|
})?)
|
|
};
|
|
mat.flag
|
|
.update(value, args)
|
|
.with_context(|| format!("error parsing flag {mat}"))?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Look for a flag by its short name.
|
|
fn find_short(&self, ch: char) -> FlagLookup<'_> {
|
|
if !ch.is_ascii() {
|
|
return FlagLookup::UnrecognizedShort(ch);
|
|
}
|
|
let byte = u8::try_from(ch).unwrap();
|
|
let Some(index) = self.map.find(&[byte]) else {
|
|
return FlagLookup::UnrecognizedShort(ch);
|
|
};
|
|
FlagLookup::Match(&self.info[index])
|
|
}
|
|
|
|
/// Look for a flag by its long name.
|
|
///
|
|
/// This also works for aliases and negated names.
|
|
fn find_long(&self, name: &str) -> FlagLookup<'_> {
|
|
let Some(index) = self.map.find(name.as_bytes()) else {
|
|
return FlagLookup::UnrecognizedLong(name.to_string());
|
|
};
|
|
FlagLookup::Match(&self.info[index])
|
|
}
|
|
}
|
|
|
|
/// The result of looking up a flag name.
|
|
#[derive(Debug)]
|
|
enum FlagLookup<'a> {
|
|
/// Lookup found a match and the metadata for the flag is attached.
|
|
Match(&'a FlagInfo),
|
|
/// The given short name is unrecognized.
|
|
UnrecognizedShort(char),
|
|
/// The given long name is unrecognized.
|
|
UnrecognizedLong(String),
|
|
}
|
|
|
|
/// The info about a flag associated with a flag's ID in the flag map.
|
|
#[derive(Debug)]
|
|
struct FlagInfo {
|
|
/// The flag object and its associated metadata.
|
|
flag: &'static dyn Flag,
|
|
/// The actual name that is stored in the Aho-Corasick automaton. When this
|
|
/// is a byte, it corresponds to a short single character ASCII flag. The
|
|
/// actual pattern that's in the Aho-Corasick automaton is just the single
|
|
/// byte.
|
|
name: Result<&'static str, u8>,
|
|
/// The type of flag that is stored for the corresponding Aho-Corasick
|
|
/// pattern.
|
|
kind: FlagInfoKind,
|
|
}
|
|
|
|
/// The way in which a name refers to its flag.
#[derive(Debug)]
enum FlagInfoKind {
    /// The flag's standard name, e.g., --passthru.
    Standard,
    /// The negated form of a standard flag, e.g., --no-multiline.
    Negated,
    /// An alternative name for a standard flag, e.g., --passthrough.
    Alias,
}
|
|
|
|
impl std::fmt::Display for FlagInfo {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
match self.name {
|
|
Ok(long) => write!(f, "--{long}"),
|
|
Err(short) => write!(f, "-{short}", short = char::from(short)),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Maps every flag name (short, long, negated and aliases) to its ID.
///
/// An ID can then be used to look up the flag's metadata in the parser's
/// internal state.
#[derive(Debug)]
struct FlagMap {
    /// The raw bytes of each name, mapped to the name's ID.
    map: std::collections::HashMap<Vec<u8>, usize>,
}
|
|
|
|
impl FlagMap {
|
|
/// Create a new map of flags for the given flag information.
|
|
///
|
|
/// The index of each flag info corresponds to its ID.
|
|
fn new(infos: &[FlagInfo]) -> FlagMap {
|
|
let mut map = std::collections::HashMap::with_capacity(infos.len());
|
|
for (i, info) in infos.iter().enumerate() {
|
|
match info.name {
|
|
Ok(name) => {
|
|
assert_eq!(None, map.insert(name.as_bytes().to_vec(), i));
|
|
}
|
|
Err(byte) => {
|
|
assert_eq!(None, map.insert(vec![byte], i));
|
|
}
|
|
}
|
|
}
|
|
FlagMap { map }
|
|
}
|
|
|
|
/// Look for a match of `name` in the given Aho-Corasick automaton.
|
|
///
|
|
/// This only returns a match if the one found has a length equivalent to
|
|
/// the length of the name given.
|
|
fn find(&self, name: &[u8]) -> Option<usize> {
|
|
self.map.get(name).copied()
|
|
}
|
|
}
|
|
|
|
/// Possibly return a message suggesting flags similar in the name to the one
|
|
/// given.
|
|
///
|
|
/// The one given should be a flag given by the user (without the leading
|
|
/// dashes) that was unrecognized. This attempts to find existing flags that
|
|
/// are similar to the one given.
|
|
fn suggest(unrecognized: &str) -> Option<String> {
|
|
let similars = find_similar_names(unrecognized);
|
|
if similars.is_empty() {
|
|
return None;
|
|
}
|
|
let list = similars
|
|
.into_iter()
|
|
.map(|name| format!("--{name}"))
|
|
.collect::<Vec<String>>()
|
|
.join(", ");
|
|
Some(format!("similar flags that are available: {list}"))
|
|
}
|
|
|
|
/// Return a sequence of names similar to the unrecognized name given.
|
|
fn find_similar_names(unrecognized: &str) -> Vec<&'static str> {
|
|
// The jaccard similarity threshold at which we consider two flag names
|
|
// similar enough that it's worth suggesting it to the end user.
|
|
//
|
|
// This value was determined by some ad hoc experimentation. It might need
|
|
// further tweaking.
|
|
const THRESHOLD: f64 = 0.4;
|
|
|
|
let mut similar = vec![];
|
|
let bow_given = ngrams(unrecognized);
|
|
for &flag in FLAGS.iter() {
|
|
let name = flag.name_long();
|
|
let bow = ngrams(name);
|
|
if jaccard_index(&bow_given, &bow) >= THRESHOLD {
|
|
similar.push(name);
|
|
}
|
|
if let Some(name) = flag.name_negated() {
|
|
let bow = ngrams(name);
|
|
if jaccard_index(&bow_given, &bow) >= THRESHOLD {
|
|
similar.push(name);
|
|
}
|
|
}
|
|
for name in flag.aliases() {
|
|
let bow = ngrams(name);
|
|
if jaccard_index(&bow_given, &bow) >= THRESHOLD {
|
|
similar.push(name);
|
|
}
|
|
}
|
|
}
|
|
similar
|
|
}
|
|
|
|
/// A "bag of words" is a set of ngrams.
type BagOfWords<'a> = BTreeSet<Cow<'a, [u8]>>;

/// Returns the jaccard index (a measure of similarity) between sets of ngrams.
///
/// The index is the number of ngrams found in both sets divided by the number
/// of distinct ngrams found in either set. It is `1.0` for identical non-empty
/// sets and `0.0` for sets with nothing in common.
///
/// When both sets are empty there is nothing to compare, so this returns
/// `0.0` (not similar) rather than the `NaN` that `0 / 0` would produce.
///
/// # Panics
///
/// This panics if a set size does not fit in a `u32`.
fn jaccard_index(ngrams1: &BagOfWords<'_>, ngrams2: &BagOfWords<'_>) -> f64 {
    let intersection = ngrams1.intersection(ngrams2).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|, which avoids walking both sets a second
    // time just to count the union.
    let union = ngrams1.len() + ngrams2.len() - intersection;
    if union == 0 {
        return 0.0;
    }
    let to_f64 = |count: usize| {
        f64::from(u32::try_from(count).expect("fewer than u32::MAX ngrams"))
    };
    to_f64(intersection) / to_f64(union)
}
|
|
|
|
/// Returns all 3-grams in the slice given.
|
|
///
|
|
/// If the slice doesn't contain a 3-gram, then one is artificially created by
|
|
/// padding it out with a character that will never appear in a flag name.
|
|
fn ngrams(flag_name: &str) -> BagOfWords<'_> {
|
|
// We only allow ASCII flag names, so we can just use bytes.
|
|
let slice = flag_name.as_bytes();
|
|
let seq: Vec<Cow<[u8]>> = match slice.len() {
|
|
0 => vec![Cow::Owned(b"!!!".to_vec())],
|
|
1 => vec![Cow::Owned(vec![slice[0], b'!', b'!'])],
|
|
2 => vec![Cow::Owned(vec![slice[0], slice[1], b'!'])],
|
|
_ => slice.windows(3).map(Cow::Borrowed).collect(),
|
|
};
|
|
BTreeSet::from_iter(seq)
|
|
}
|