Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'helix-core/src/command_line.rs')
| -rw-r--r-- | helix-core/src/command_line.rs | 1266 |
1 files changed, 1266 insertions, 0 deletions
diff --git a/helix-core/src/command_line.rs b/helix-core/src/command_line.rs new file mode 100644 index 00000000..4c762a71 --- /dev/null +++ b/helix-core/src/command_line.rs @@ -0,0 +1,1266 @@ +//! Types and parsing code for command mode (`:`) input. +//! +//! Command line parsing is done in steps: +//! +//! * The `Tokenizer` iterator returns `Token`s from the command line input naively - without +//! accounting for a command's signature. +//! * When executing a command (pressing `<ret>` in command mode), tokens are expanded with +//! information from the editor like the current cursor line or column. Otherwise the tokens +//! are unwrapped to their inner content. +//! * `Args` interprets the contents (potentially expanded) as flags or positional arguments. +//! When executing a command, `Args` performs validations like checking the number of positional +//! arguments supplied and whether duplicate or unknown flags were supplied. +//! +//! `Args` is the interface used by typable command implementations. `Args` may be treated as a +//! slice of `Cow<str>` or `&str` to access positional arguments, for example `for arg in args` +//! iterates over positional args (never flags) and `&args[0]` always corresponds to the first +//! positional. Use `Args::has_flag` and `Args::get_flag` to read any specified flags. +//! +//! `Args` and `Tokenizer` are intertwined. `Args` may ask the `Tokenizer` for the rest of the +//! command line as a single token after the configured number of positionals has been reached +//! (according to `raw_after`). This is used for the custom parsing in `:set-option` and +//! `:toggle-option` for example. Outside of executing commands, the `Tokenizer` can be used +//! directly to interpret a string according to the regular tokenization rules. +//! +//! This module also defines structs for configuring the parsing of the command line for a +//! command. See `Flag` and `Signature`. + +use std::{borrow::Cow, collections::HashMap, error::Error, fmt, ops, slice, vec}; + +/// Splits a command line into the command and arguments parts. +/// +/// The third tuple member describes whether the command part is finished. When this boolean is +/// true the completion code for the command line should complete command names, otherwise +/// command arguments. +pub fn split(line: &str) -> (&str, &str, bool) { + const SEPARATOR_PATTERN: [char; 2] = [' ', '\t']; + + let (command, rest) = line.split_once(SEPARATOR_PATTERN).unwrap_or((line, "")); + + let complete_command = + command.is_empty() || (rest.trim().is_empty() && !line.ends_with(SEPARATOR_PATTERN)); + + (command, rest, complete_command) +} + +/// A Unix-like flag that a command may accept. +/// +/// For example the `:sort` command accepts a `--reverse` (or `-r` for shorthand) boolean flag +/// which controls the direction of sorting. Flags may accept an argument by setting the +/// `completions` field to `Some`. +#[derive(Debug, Clone, Copy)] +pub struct Flag { + /// The name of the flag. + /// + /// This value is also used to construct the "longhand" version of the flag. For example a + /// flag with a name "reverse" has a longhand `--reverse`. + /// + /// This value should be supplied when reading a flag out of the [Args] with [Args::get_flag] + /// and [Args::has_flag]. The `:sort` command implementation for example should ask for + /// `args.has_flag("reverse")`. + pub name: &'static str, + /// The character that can be used as a shorthand for the flag, optionally. + /// + /// For example a flag like "reverse" mentioned above might take an alias `Some('r')` to + /// allow specifying the flag as `-r`. + pub alias: Option<char>, + pub doc: &'static str, + /// The completion values to use when specifying an argument for a flag. + /// + /// This should be set to `None` for boolean flags and `Some(&["foo", "bar", "baz"])` for + /// example for flags which accept options, with the strings corresponding to values that + /// should be shown in completion. + pub completions: Option<&'static [&'static str]>, +} + +impl Flag { + // This allows defining flags with the `..Flag::DEFAULT` shorthand. The `name` and `doc` + // fields should always be overwritten. + pub const DEFAULT: Self = Self { + name: "", + doc: "", + alias: None, + completions: None, + }; +} + +/// A description of how a command's input should be handled. +/// +/// Each typable command defines a signature (with the help of `Signature::DEFAULT`) at least to +/// declare how many positional arguments it accepts. Command flags are also declared in this +/// struct. The `raw_after` option may be set optionally to avoid evaluating quotes in parts of +/// the command line (useful for shell commands for example). +#[derive(Debug, Clone, Copy)] +#[allow(clippy::manual_non_exhaustive)] +pub struct Signature { + /// The minimum and (optionally) maximum number of positional arguments a command may take. + /// + /// For example accepting exactly one positional can be specified with `(1, Some(1))` while + /// accepting zero-or-more positionals can be specified as `(0, None)`. + /// + /// The number of positionals is checked when hitting `<ret>` in command mode. If the actual + /// number of positionals is outside the declared range then the command is not executed and + /// an error is shown instead. For example `:write` accepts zero or one positional arguments + /// (`(0, Some(1))`). A command line like `:write a.txt b.txt` is outside the declared range + /// and is not accepted. + pub positionals: (usize, Option<usize>), + /// The number of **positional** arguments for the parser to read with normal quoting rules. + /// + /// Once the number has been exceeded then the tokenizer returns the rest of the input as a + /// `TokenKind::Expand` token (see `Tokenizer::rest`), meaning that quoting rules do not apply + /// and none of the remaining text may be treated as a flag. + /// + /// If this is set to `None` then the entire command line is parsed with normal quoting and + /// flag rules. + /// + /// A good example use-case for this option is `:toggle-option` which sets `Some(1)`. + /// Everything up to the first positional argument is interpreted according to normal rules + /// and the rest of the input is parsed "raw". This allows `:toggle-option` to perform custom + /// parsing on the rest of the input - namely parsing complicated values as a JSON stream. + /// `:toggle-option` could accept a flag in the future. If so, the flag would need to come + /// before the first positional argument. + /// + /// Consider these lines for `:toggle-option` which sets `Some(1)`: + /// + /// * `:toggle foo` has one positional "foo" and no flags. + /// * `:toggle foo bar` has two positionals. Expansions for `bar` are evaluated but quotes + /// and anything that looks like a flag are treated literally. + /// * `:toggle foo --bar` has two positionals: `["foo", "--bar"]`. `--bar` is not considered + /// to be a flag because it comes after the first positional. + /// * `:toggle --bar foo` has one positional "foo" and one flag "--bar". + /// * `:toggle --bar foo --baz` has two positionals `["foo", "--baz"]` and one flag "--bar". + pub raw_after: Option<u8>, + /// A set of flags that a command may accept. + /// + /// See the `Flag` struct for more info. + pub flags: &'static [Flag], + /// Do not set this field. Use `..Signature::DEFAULT` to construct a `Signature` instead. + // This field allows adding new fields later with minimal code changes. This works like a + // `#[non_exhaustive]` annotation except that it supports the `..Signature::DEFAULT` + // shorthand. + pub _dummy: (), +} + +impl Signature { + // This allows defining signatures with the `..Signature::DEFAULT` shorthand. The + // `positionals` field should always be overwritten. + pub const DEFAULT: Self = Self { + positionals: (0, None), + raw_after: None, + flags: &[], + _dummy: (), + }; + + fn check_positional_count(&self, actual: usize) -> Result<(), ParseArgsError<'static>> { + let (min, max) = self.positionals; + if min <= actual && max.unwrap_or(usize::MAX) >= actual { + Ok(()) + } else { + Err(ParseArgsError::WrongPositionalCount { min, max, actual }) + } + } +} + +#[derive(Debug, PartialEq, Eq)] +pub enum ParseArgsError<'a> { + WrongPositionalCount { + min: usize, + max: Option<usize>, + actual: usize, + }, + UnterminatedToken { + token: Token<'a>, + }, + DuplicatedFlag { + flag: &'static str, + }, + UnknownFlag { + text: Cow<'a, str>, + }, + FlagMissingArgument { + flag: &'static str, + }, + MissingExpansionDelimiter { + expansion: &'a str, + }, + UnknownExpansion { + kind: &'a str, + }, +} + +impl fmt::Display for ParseArgsError<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::WrongPositionalCount { min, max, actual } => { + write!(f, "expected ")?; + let maybe_plural = |n| if n == 1 { "" } else { "s" }; + match (min, max) { + (0, Some(0)) => write!(f, "no arguments")?, + (min, Some(max)) if min == max => { + write!(f, "exactly {min} argument{}", maybe_plural(*min))? + } + (min, _) if actual < min => { + write!(f, "at least {min} argument{}", maybe_plural(*min))? + } + (_, Some(max)) if actual > max => { + write!(f, "at most {max} argument{}", maybe_plural(*max))? + } + // `actual` must be either less than `min` or greater than `max` for this type + // to be constructed. + _ => unreachable!(), + } + + write!(f, ", got {actual}") + } + Self::UnterminatedToken { token } => { + write!(f, "unterminated token {}", token.content) + } + Self::DuplicatedFlag { flag } => { + write!(f, "flag '--{flag}' specified more than once") + } + Self::UnknownFlag { text } => write!(f, "unknown flag '{text}'"), + Self::FlagMissingArgument { flag } => { + write!(f, "flag '--{flag}' missing an argument") + } + Self::MissingExpansionDelimiter { expansion } => { + write!(f, "missing a string delimiter after '%{expansion}'") + } + Self::UnknownExpansion { kind } => { + write!(f, "unknown expansion '{kind}'") + } + } + } +} + +impl Error for ParseArgsError<'_> {} + +/// The kind of expansion to use on the token's content. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExpansionKind { + /// Expand variables from the editor's state. + /// + /// For example `%{cursor_line}`. + Variable, + /// Treat the token contents as hexadecimal corresponding to a Unicode codepoint value. + /// + /// For example `%u{25CF}`. + Unicode, + /// Run the token's contents via the configured shell program. + /// + /// For example `%sh{echo hello}`. + Shell, +} + +impl ExpansionKind { + pub const VARIANTS: &'static [Self] = &[Self::Variable, Self::Unicode, Self::Shell]; + + pub const fn as_str(&self) -> &'static str { + match self { + Self::Variable => "", + Self::Unicode => "u", + Self::Shell => "sh", + } + } + + pub fn from_kind(name: &str) -> Option<Self> { + match name { + "" => Some(Self::Variable), + "u" => Some(Self::Unicode), + "sh" => Some(Self::Shell), + _ => None, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Quote { + Single, + Backtick, +} + +impl Quote { + pub const fn char(&self) -> char { + match self { + Self::Single => '\'', + Self::Backtick => '`', + } + } + + // Quotes can be escaped by doubling them: `'hello '' world'` becomes `hello ' world`. + pub const fn escape(&self) -> &'static str { + match self { + Self::Single => "''", + Self::Backtick => "``", + } + } +} + +/// The type of argument being written. +/// +/// The token kind decides how an argument in the command line will be expanded upon hitting +/// `<ret>` in command mode. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TokenKind { + /// Unquoted text. + /// + /// For example in `:echo hello world`, "hello" and "world" are raw tokens. + Unquoted, + /// Quoted text which is interpreted literally. + /// + /// The purpose of this kind is to avoid splitting arguments on whitespace. For example + /// `:open 'a b.txt'` will result in opening a file with a single argument `"a b.txt"`. + /// + /// Using expansions within single quotes or backticks will result in the expansion text + /// being shown literally. For example `:echo '%u{0020}'` will print `"%u{0020}"` to the + /// statusline. + Quoted(Quote), + /// Text within double quote delimiters (`"`). + /// + /// The inner text of a double quoted argument can be further expanded. For example + /// `:echo "line: #%{cursor_line}"` could print `"line: #1"` to the statusline. + Expand, + /// An expansion / "percent token". + /// + /// These take the form `%[<kind>]<open><contents><close>`. See `ExpansionKind`. + Expansion(ExpansionKind), + /// A token kind that exists for the sake of completion. + /// + /// In input like `%foo` this token contains the text `"%foo"`. The content start is the byte + /// after the percent token. + /// + /// When `Tokenizer` is passed `true` for its `validate` parameter this token cannot be + /// returned: inputs that would return this token get a validation error instead. + ExpansionKind, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Token<'a> { + pub kind: TokenKind, + /// The byte index into the input where the token's content starts. + /// + /// For quoted text this means the byte after the quote. For expansions this means the byte + /// after the opening delimiter. + pub content_start: usize, + /// The inner content of the token. + /// + /// Usually this content borrows from the input but an owned value may be used in cases of + /// escaping. On Unix systems a raw token like `a\ b` has the contents `"a b"`. + pub content: Cow<'a, str>, + /// Whether the token's opening delimiter is closed. + /// + /// For example a quote `"foo"` is closed but not `"foo` or an expansion `%sh{..}` is closed + /// but not `%sh{echo {}`. + pub is_terminated: bool, +} + +impl Token<'_> { + pub fn empty_at(content_start: usize) -> Self { + Self { + kind: TokenKind::Unquoted, + content_start, + content: Cow::Borrowed(""), + is_terminated: false, + } + } +} + +#[derive(Debug)] +pub struct Tokenizer<'a> { + input: &'a str, + /// Whether to return errors in the iterator for failed validations like unterminated strings + /// or expansions. When this is set to `false` the iterator will never return `Err`. + validate: bool, + /// The current byte index of the input being considered. + pos: usize, +} + +impl<'a> Tokenizer<'a> { + pub fn new(input: &'a str, validate: bool) -> Self { + Self { + input, + validate, + pos: 0, + } + } + + /// Returns the current byte index position of the parser in the input. + pub fn pos(&self) -> usize { + self.pos + } + + /// Returns the rest of the input as a single `TokenKind::Expand` token literally. + /// + /// Returns `None` if the tokenizer is already at the end of the input or advances the + /// tokenizer to the end of the input otherwise. Leading whitespace characters are skipped. + /// Quoting is not interpreted. + pub fn rest(&mut self) -> Option<Token<'a>> { + self.skip_blanks(); + + if self.pos == self.input.len() { + return None; + } + + let content_start = self.pos; + self.pos = self.input.len(); + Some(Token { + kind: TokenKind::Expand, + content_start, + content: Cow::Borrowed(&self.input[content_start..]), + is_terminated: false, + }) + } + + fn byte(&self) -> Option<u8> { + self.input.as_bytes().get(self.pos).copied() + } + + fn peek_byte(&self) -> Option<u8> { + self.input.as_bytes().get(self.pos + 1).copied() + } + + fn prev_byte(&self) -> Option<u8> { + self.pos + .checked_sub(1) + .map(|idx| self.input.as_bytes()[idx]) + } + + fn skip_blanks(&mut self) { + while let Some(b' ' | b'\t') = self.byte() { + self.pos += 1; + } + } + + fn parse_unquoted(&mut self) -> Cow<'a, str> { + // Note that `String::new` starts with no allocation. We only allocate if we see a + // backslash escape (on Unix only). + let mut escaped = String::new(); + let mut start = self.pos; + + while let Some(byte) = self.byte() { + if matches!(byte, b' ' | b'\t') { + if cfg!(unix) && self.prev_byte() == Some(b'\\') { + // Push everything up to but not including the backslash and then this + // whitespace character. + escaped.push_str(&self.input[start..self.pos - 1]); + escaped.push(byte as char); + start = self.pos + 1; + } else if escaped.is_empty() { + return Cow::Borrowed(&self.input[start..self.pos]); + } else { + break; + } + } + + self.pos += 1; + } + + // Special case for a trailing backslash on Unix: exclude the backslash from the content. + // This improves the behavior of completions like `":open a\\"` (trailing backslash). + let end = if cfg!(unix) && self.prev_byte() == Some(b'\\') { + self.pos - 1 + } else { + self.pos + }; + + if escaped.is_empty() { + assert_eq!(self.pos, self.input.len()); + Cow::Borrowed(&self.input[start..end]) + } else { + escaped.push_str(&self.input[start..end]); + Cow::Owned(escaped) + } + } + + /// Parses a string quoted by the given grapheme cluster. + /// + /// The position of the tokenizer is asserted to be immediately after the quote grapheme + /// cluster. + fn parse_quoted(&mut self, quote: u8) -> (Cow<'a, str>, bool) { + assert_eq!(self.byte(), Some(quote)); + self.pos += 1; + + let mut escaped = String::new(); + while let Some(offset) = self.input[self.pos..].find(quote as char) { + let idx = self.pos + offset; + if self.input.as_bytes().get(idx + 1) == Some("e) { + // Treat two quotes in a row as an escape. + escaped.push_str(&self.input[self.pos..idx + 1]); + // Advance past the escaped quote. + self.pos = idx + 2; + } else { + // Otherwise this quote string is finished. + let quoted = if escaped.is_empty() { + Cow::Borrowed(&self.input[self.pos..idx]) + } else { + escaped.push_str(&self.input[self.pos..idx]); + Cow::Owned(escaped) + }; + // Advance past the closing quote. + self.pos = idx + 1; + return (quoted, true); + } + } + + let quoted = if escaped.is_empty() { + Cow::Borrowed(&self.input[self.pos..]) + } else { + escaped.push_str(&self.input[self.pos..]); + Cow::Owned(escaped) + }; + self.pos = self.input.len(); + + (quoted, false) + } + + /// Parses the percent token expansion under the tokenizer's cursor. + /// + /// This function should only be called when the tokenizer's cursor is on a non-escaped + /// percent token. + pub fn parse_percent_token(&mut self) -> Option<Result<Token<'a>, ParseArgsError<'a>>> { + assert_eq!(self.byte(), Some(b'%')); + + self.pos += 1; + let kind_start = self.pos; + self.pos += self.input[self.pos..] + .bytes() + .take_while(|b| b.is_ascii_lowercase()) + .count(); + let kind = &self.input[kind_start..self.pos]; + + let (open, close) = match self.byte() { + // We support a couple of hard-coded chars only to make sure we can provide more + // useful errors and avoid weird behavior in case of typos. These should cover + // practical cases. + Some(b'(') => (b'(', b')'), + Some(b'[') => (b'[', b']'), + Some(b'{') => (b'{', b'}'), + Some(b'<') => (b'<', b'>'), + Some(b'\'') => (b'\'', b'\''), + Some(b'\"') => (b'\"', b'\"'), + Some(b'|') => (b'|', b'|'), + Some(_) | None => { + return Some(if self.validate { + Err(ParseArgsError::MissingExpansionDelimiter { expansion: kind }) + } else { + Ok(Token { + kind: TokenKind::ExpansionKind, + content_start: kind_start, + content: Cow::Borrowed(kind), + is_terminated: false, + }) + }); + } + }; + // The content start for expansions is the start of the content - after the opening + // delimiter grapheme. + let content_start = self.pos + 1; + let kind = match ExpansionKind::from_kind(kind) { + Some(kind) => TokenKind::Expansion(kind), + None if self.validate => { + return Some(Err(ParseArgsError::UnknownExpansion { kind })); + } + None => TokenKind::Expand, + }; + + let (content, is_terminated) = if open == close { + self.parse_quoted(open) + } else { + self.parse_quoted_balanced(open, close) + }; + + let token = Token { + kind, + content_start, + content, + is_terminated, + }; + + if self.validate && !is_terminated { + return Some(Err(ParseArgsError::UnterminatedToken { token })); + } + + Some(Ok(token)) + } + + /// Parse the next string under the cursor given an open and closing pair. + /// + /// The open and closing pair are different ASCII characters. The cursor is asserted to be + /// immediately after the opening delimiter. + /// + /// This function parses with nesting support. `%sh{echo {hello}}` for example should consume + /// the entire input and not quit after the first '}' character is found. + fn parse_quoted_balanced(&mut self, open: u8, close: u8) -> (Cow<'a, str>, bool) { + assert_eq!(self.byte(), Some(open)); + self.pos += 1; + let start = self.pos; + let mut level = 1; + + while let Some(offset) = self.input[self.pos..].find([open as char, close as char]) { + let idx = self.pos + offset; + // Move past the delimiter. + self.pos = idx + 1; + + let byte = self.input.as_bytes()[idx]; + if byte == open { + level += 1; + } else if byte == close { + level -= 1; + if level == 0 { + break; + } + } else { + unreachable!() + } + } + + let is_terminated = level == 0; + let end = if is_terminated { + // Exclude the closing delimiter from the token's content. + self.pos - 1 + } else { + // When the token is not closed, advance to the end of the input. + self.pos = self.input.len(); + self.pos + }; + + (Cow::Borrowed(&self.input[start..end]), is_terminated) + } +} + +impl<'a> Iterator for Tokenizer<'a> { + type Item = Result<Token<'a>, ParseArgsError<'a>>; + + fn next(&mut self) -> Option<Self::Item> { + self.skip_blanks(); + + let byte = self.byte()?; + match byte { + b'"' | b'\'' | b'`' => { + let content_start = self.pos + 1; + let (content, is_terminated) = self.parse_quoted(byte); + let token = Token { + kind: match byte { + b'"' => TokenKind::Expand, + b'\'' => TokenKind::Quoted(Quote::Single), + b'`' => TokenKind::Quoted(Quote::Backtick), + _ => unreachable!(), + }, + content_start, + content, + is_terminated, + }; + + Some(if self.validate && !is_terminated { + Err(ParseArgsError::UnterminatedToken { token }) + } else { + Ok(token) + }) + } + b'%' => self.parse_percent_token(), + _ => { + let content_start = self.pos; + + // Allow backslash escaping on Unix for quotes or expansions + if cfg!(unix) + && byte == b'\\' + && matches!(self.peek_byte(), Some(b'"' | b'\'' | b'`' | b'%')) + { + self.pos += 1; + } + + Some(Ok(Token { + kind: TokenKind::Unquoted, + content_start, + content: self.parse_unquoted(), + is_terminated: false, + })) + } + } + } +} + +#[derive(Debug, Default, Clone, Copy)] +pub enum CompletionState { + #[default] + Positional, + Flag(Option<Flag>), + FlagArgument(Flag), +} + +/// A set of arguments provided to a command on the command line. +/// +/// Regular arguments are called "positional" arguments (or "positionals" for short). Command line +/// input might also specify "flags" which can modify a command's behavior. +/// +/// ```rust,ignore +/// // Say that the command accepts a "bar" flag which doesn't accept an argument itself. +/// // This input has two positionals, "foo" and "baz" and one flag "--bar". +/// let args = Args::parse("foo --bar baz", /* .. */); +/// // `Args` may be treated like a slice to access positionals. +/// assert_eq!(args.len(), 2); +/// assert_eq!(&args[0], "foo"); +/// assert_eq!(&args[1], "baz"); +/// // Use `has_flag` or `get_flag` to access flags. +/// assert!(args.has_flag("bar")); +/// ``` +/// +/// The `Args` type can be treated mostly the same as a slice when accessing positional arguments. +/// Common slice methods like `len`, `get`, `first` and `join` only expose positional arguments. +/// Additionally, common syntax like `for arg in args` or `&args[idx]` is supported for accessing +/// positional arguments. +/// +/// To look up flags, use `Args::get_flag` for flags which should accept an argument or +/// `Args::has_flag` for boolean flags. +/// +/// The way that `Args` is parsed from the input depends on a command's `Signature`. See the +/// `Signature` type for more details. +#[derive(Debug)] +pub struct Args<'a> { + signature: Signature, + /// Whether to validate the arguments. + /// See the `ParseArgsError` type for the validations. + validate: bool, + /// Whether args pushed with `Self::push` should be treated as positionals even if they + /// start with '-'. + only_positionals: bool, + state: CompletionState, + positionals: Vec<Cow<'a, str>>, + flags: HashMap<&'static str, Cow<'a, str>>, +} + +impl Default for Args<'_> { + fn default() -> Self { + Self { + signature: Signature::DEFAULT, + validate: Default::default(), + only_positionals: Default::default(), + state: CompletionState::default(), + positionals: Default::default(), + flags: Default::default(), + } + } +} + +impl<'a> Args<'a> { + pub fn new(signature: Signature, validate: bool) -> Self { + Self { + signature, + validate, + only_positionals: false, + positionals: Vec::new(), + flags: HashMap::new(), + state: CompletionState::default(), + } + } + + /// Reads the next token out of the given parser. + /// + /// If the command's signature sets a maximum number of positionals (via `raw_after`) then + /// the token may contain the rest of the parser's input. + pub fn read_token<'p>( + &mut self, + parser: &mut Tokenizer<'p>, + ) -> Result<Option<Token<'p>>, ParseArgsError<'p>> { + if self + .signature + .raw_after + .is_some_and(|max| self.len() >= max as usize) + { + self.only_positionals = true; + Ok(parser.rest()) + } else { + parser.next().transpose() + } + } + + /// Parses the given command line according to a command's signature. + /// + /// The `try_map_fn` function can be used to try changing each token before it is considered + /// as an argument - this is used for variable expansion. + pub fn parse<M>( + line: &'a str, + signature: Signature, + validate: bool, + mut try_map_fn: M, + ) -> Result<Self, Box<dyn Error + 'a>> + where + // Note: this is a `FnMut` in case we decide to allow caching expansions in the future. + // The `mut` is not currently used. + M: FnMut(Token<'a>) -> Result<Cow<'a, str>, Box<dyn Error>>, + { + let mut tokenizer = Tokenizer::new(line, validate); + let mut args = Self::new(signature, validate); + + while let Some(token) = args.read_token(&mut tokenizer)? { + let arg = try_map_fn(token)?; + args.push(arg)?; + } + + args.finish()?; + + Ok(args) + } + + /// Adds the given argument token. + /// + /// Once all arguments have been added, `Self::finish` should be called to perform any + /// closing validations. + pub fn push(&mut self, arg: Cow<'a, str>) -> Result<(), ParseArgsError<'a>> { + if !self.only_positionals && arg == "--" { + // "--" marks the end of flags, everything after is a positional even if it starts + // with '-'. + self.only_positionals = true; + self.state = CompletionState::Flag(None); + } else if let Some(flag) = self.flag_awaiting_argument() { + // If the last token was a flag which accepts an argument, treat this token as a flag + // argument. + self.flags.insert(flag.name, arg); + self.state = CompletionState::FlagArgument(flag); + } else if !self.only_positionals && arg.starts_with('-') { + // If the token starts with '-' and we are not only accepting positional arguments, + // treat this token as a flag. + let flag = if let Some(longhand) = arg.strip_prefix("--") { + self.signature + .flags + .iter() + .find(|flag| flag.name == longhand) + } else { + let shorthand = arg.strip_prefix('-').unwrap(); + self.signature.flags.iter().find(|flag| { + flag.alias + .is_some_and(|ch| shorthand == ch.encode_utf8(&mut [0; 4])) + }) + }; + + let Some(flag) = flag else { + if self.validate { + return Err(ParseArgsError::UnknownFlag { text: arg }); + } + + self.positionals.push(arg); + self.state = CompletionState::Flag(None); + return Ok(()); + }; + + if self.validate && self.flags.contains_key(flag.name) { + return Err(ParseArgsError::DuplicatedFlag { flag: flag.name }); + } + + self.flags.insert(flag.name, Cow::Borrowed("")); + self.state = CompletionState::Flag(Some(*flag)); + } else { + // Otherwise this token is a positional argument. + self.positionals.push(arg); + self.state = CompletionState::Positional; + } + + Ok(()) + } + + /// Performs any validations that must be done after the input args are finished being pushed + /// with `Self::push`. + fn finish(&self) -> Result<(), ParseArgsError<'a>> { + if !self.validate { + return Ok(()); + }; + + if let Some(flag) = self.flag_awaiting_argument() { + return Err(ParseArgsError::FlagMissingArgument { flag: flag.name }); + } + self.signature + .check_positional_count(self.positionals.len())?; + + Ok(()) + } + + fn flag_awaiting_argument(&self) -> Option<Flag> { + match self.state { + CompletionState::Flag(flag) => flag.filter(|f| f.completions.is_some()), + _ => None, + } + } + + /// Returns the kind of argument the last token is considered to be. + /// + /// For example if the last argument in the command line is `--foo` then the argument may be + /// considered to be a flag. + pub fn completion_state(&self) -> CompletionState { + self.state + } + + /// Returns the number of positionals supplied in the input. + /// + /// This number does not account for any flags passed in the input. + pub fn len(&self) -> usize { + self.positionals.len() + } + + /// Checks whether the arguments contain no positionals. + /// + /// Note that this function returns `true` if there are no positional arguments even if the + /// input contained flags. + pub fn is_empty(&self) -> bool { + self.positionals.is_empty() + } + + /// Gets the first positional argument, if one exists. + pub fn first(&'a self) -> Option<&'a str> { + self.positionals.first().map(AsRef::as_ref) + } + + /// Gets the positional argument at the given index, if one exists. + pub fn get(&'a self, index: usize) -> Option<&'a str> { + self.positionals.get(index).map(AsRef::as_ref) + } + + /// Flattens all positional arguments together with the given separator between each + /// positional. + pub fn join(&self, sep: &str) -> String { + self.positionals.join(sep) + } + + /// Returns an iterator over all positional arguments. + pub fn iter(&self) -> slice::Iter<'_, Cow<'_, str>> { + self.positionals.iter() + } + + /// Gets the value associated with a flag's long name if the flag was provided. + /// + /// This function should be preferred over [Self::has_flag] when the flag accepts an argument. + pub fn get_flag(&'a self, name: &'static str) -> Option<&'a str> { + debug_assert!( + self.signature.flags.iter().any(|flag| flag.name == name), + "flag '--{name}' does not belong to the command's signature" + ); + debug_assert!( + self.signature + .flags + .iter() + .any(|flag| flag.name == name && flag.completions.is_some()), + "Args::get_flag was used for '--{name}' but should only be used for flags with arguments, use Args::has_flag instead" + ); + + self.flags.get(name).map(AsRef::as_ref) + } + + /// Checks if a flag was provided in the arguments. + /// + /// This function should be preferred over [Self::get_flag] for boolean flags - flags that + /// either are present or not. + pub fn has_flag(&self, name: &'static str) -> bool { + debug_assert!( + self.signature.flags.iter().any(|flag| flag.name == name), + "flag '--{name}' does not belong to the command's signature" + ); + debug_assert!( + self.signature + .flags + .iter() + .any(|flag| flag.name == name && flag.completions.is_none()), + "Args::has_flag was used for '--{name}' but should only be used for flags without arguments, use Args::get_flag instead" + ); + + self.flags.contains_key(name) + } +} + +// `arg[n]` +impl ops::Index<usize> for Args<'_> { + type Output = str; + + fn index(&self, index: usize) -> &Self::Output { + self.positionals[index].as_ref() + } +} + +// `for arg in args { .. }` +impl<'a> IntoIterator for Args<'a> { + type Item = Cow<'a, str>; + type IntoIter = vec::IntoIter<Cow<'a, str>>; + + fn into_iter(self) -> Self::IntoIter { + self.positionals.into_iter() + } +} + +// `for arg in &args { .. }` +impl<'i, 'a> IntoIterator for &'i Args<'a> { + type Item = &'i Cow<'a, str>; + type IntoIter = slice::Iter<'i, Cow<'a, str>>; + + fn into_iter(self) -> Self::IntoIter { + self.positionals.iter() + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[track_caller] + fn assert_tokens(input: &str, expected: &[&str]) { + let actual: Vec<_> = Tokenizer::new(input, true) + .map(|arg| arg.unwrap().content) + .collect(); + let actual: Vec<_> = actual.iter().map(|c| c.as_ref()).collect(); + + assert_eq!(actual.as_slice(), expected); + } + + #[track_caller] + fn assert_incomplete_tokens(input: &str, expected: &[&str]) { + assert!( + Tokenizer::new(input, true).collect::<Result<Vec<_>, _>>().is_err(), + "`assert_incomplete_tokens` only accepts input that fails validation, consider using `assert_tokens` instead" + ); + + let actual: Vec<_> = Tokenizer::new(input, false) + .map(|arg| arg.unwrap().content) + .collect(); + let actual: Vec<_> = actual.iter().map(|c| c.as_ref()).collect(); + + assert_eq!(actual.as_slice(), expected); + } + + #[test] + fn tokenize_unquoted() { + assert_tokens("", &[]); + assert_tokens("hello", &["hello"]); + assert_tokens("hello world", &["hello", "world"]); + // Any amount of whitespace is considered a separator. + assert_tokens("hello\t \tworld", &["hello", "world"]); + } + + // This escaping behavior is specific to Unix systems. + #[cfg(unix)] + #[test] + fn tokenize_backslash_unix() { + assert_tokens(r#"hello\ world"#, &["hello world"]); + assert_tokens(r#"one\ two three"#, &["one two", "three"]); + assert_tokens(r#"one two\ three"#, &["one", "two three"]); + // Trailing backslash is ignored - this improves completions. + assert_tokens(r#"hello\"#, &["hello"]); + // The backslash at the start of the double quote makes the quote be treated as raw. + // For the backslash before the ending quote the token is already considered raw so the + // backslash and quote are treated literally. + assert_tokens( + r#"echo \"hello world\""#, + &["echo", r#""hello"#, r#"world\""#], + ); + } + + #[test] + fn tokenize_backslash() { + assert_tokens(r#"\n"#, &["\\n"]); + assert_tokens(r#"'\'"#, &["\\"]); + } + + #[test] + fn tokenize_quoting() { + // Using a quote character twice escapes it. + assert_tokens(r#"''"#, &[""]); + assert_tokens(r#""""#, &[""]); + assert_tokens(r#"``"#, &[""]); + assert_tokens(r#"echo """#, &["echo", ""]); + + assert_tokens(r#"'hello'"#, &["hello"]); + assert_tokens(r#"'hello world'"#, &["hello world"]); + + assert_tokens(r#""hello "" world""#, &["hello \" world"]); + } + + #[test] + fn tokenize_percent() { + // Pair delimiters: + assert_tokens(r#"echo %{hello world}"#, &["echo", "hello world"]); + assert_tokens(r#"echo %[hello world]"#, &["echo", "hello world"]); + assert_tokens(r#"echo %(hello world)"#, &["echo", "hello world"]); + assert_tokens(r#"echo %<hello world>"#, &["echo", "hello world"]); + assert_tokens(r#"echo %|hello world|"#, &["echo", "hello world"]); + assert_tokens(r#"echo %'hello world'"#, &["echo", "hello world"]); + assert_tokens(r#"echo %"hello world""#, &["echo", "hello world"]); + // When invoking a command, double percents can be used within a string as an escape for + // the percent. This is done in the expansion code though, not in the parser here. + assert_tokens(r#"echo "%%hello world""#, &["echo", "%%hello world"]); + // Different kinds of quotes nested: + assert_tokens( + r#"echo "%sh{echo 'hello world'}""#, + &["echo", r#"%sh{echo 'hello world'}"#], + ); + // Nesting of the expansion delimiter: + assert_tokens(r#"echo %{hello {x} world}"#, &["echo", "hello {x} world"]); + assert_tokens( + r#"echo %{hello {{😎}} world}"#, + &["echo", "hello {{😎}} world"], + ); + + // Balanced nesting: + assert_tokens( + r#"echo %{hello {}} world}"#, + &["echo", "hello {}", "world}"], + ); + + // Recursive expansions: + assert_tokens( + r#"echo %sh{echo "%{cursor_line}"}"#, + &["echo", r#"echo "%{cursor_line}""#], + ); + // Completion should provide variable names here. (Unbalanced nesting) + assert_incomplete_tokens(r#"echo %sh{echo "%{c"#, &["echo", r#"echo "%{c"#]); + assert_incomplete_tokens(r#"echo %{hello {{} world}"#, &["echo", "hello {{} world}"]); + } + + fn parse_signature<'a>( + input: &'a str, + signature: Signature, + ) -> Result<Args<'a>, Box<dyn std::error::Error + 'a>> { + Args::parse(input, signature, true, |token| Ok(token.content)) + } + + #[test] + fn signature_validation_positionals() { + let signature = Signature { + positionals: (2, Some(3)), + ..Signature::DEFAULT + }; + + assert!(parse_signature("hello world", signature).is_ok()); + assert!(parse_signature("foo bar baz", signature).is_ok()); + assert!(parse_signature(r#"a "b c" d"#, signature).is_ok()); + + assert!(parse_signature("hello", signature).is_err()); + assert!(parse_signature("foo bar baz quiz", signature).is_err()); + + let signature = Signature { + positionals: (1, None), + ..Signature::DEFAULT + }; + + assert!(parse_signature("a", signature).is_ok()); + assert!(parse_signature("a b", signature).is_ok()); + assert!(parse_signature(r#"a "b c" d"#, signature).is_ok()); + + assert!(parse_signature("", signature).is_err()); + } + + #[test] + fn flags() { + let signature = Signature { + positionals: (1, Some(2)), + flags: &[ + Flag { + name: "foo", + alias: Some('f'), + doc: "", + completions: None, + }, + Flag { + name: "bar", + alias: Some('b'), + doc: "", + completions: Some(&[]), + }, + ], + ..Signature::DEFAULT + }; + + let args = parse_signature("hello", signature).unwrap(); + assert_eq!(args.len(), 1); + assert_eq!(&args[0], "hello"); + assert!(!args.has_flag("foo")); + assert!(args.get_flag("bar").is_none()); + + let args = parse_signature("--bar abcd hello world --foo", signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "hello"); + assert_eq!(&args[1], "world"); + assert!(args.has_flag("foo")); + assert_eq!(args.get_flag("bar"), Some("abcd")); + + let args = parse_signature("hello -f -b abcd world", signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "hello"); + assert_eq!(&args[1], "world"); + assert!(args.has_flag("foo")); + assert_eq!(args.get_flag("bar"), Some("abcd")); + + // The signature requires at least one positional. + assert!(parse_signature("--foo", signature).is_err()); + // And at most two. + assert!(parse_signature("abc --bar baz def efg", signature).is_err()); + + let args = parse_signature(r#"abc -b "xyz 123" def"#, signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "abc"); + assert_eq!(&args[1], "def"); + assert_eq!(args.get_flag("bar"), Some("xyz 123")); + + // Unknown flags are validation errors. + assert!(parse_signature(r#"foo --quiz"#, signature).is_err()); + // Duplicated flags are parsing errors. + assert!(parse_signature(r#"--foo bar --foo"#, signature).is_err()); + assert!(parse_signature(r#"-f bar --foo"#, signature).is_err()); + + // "--" can be used to mark the end of flags. Everything after is considered a positional. + let args = parse_signature(r#"hello --bar baz -- --foo"#, signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "hello"); + assert_eq!(&args[1], "--foo"); + assert_eq!(args.get_flag("bar"), Some("baz")); + assert!(!args.has_flag("foo")); + } + + #[test] + fn raw_after() { + let signature = Signature { + positionals: (1, Some(1)), + raw_after: Some(0), + ..Signature::DEFAULT + }; + + // All quoting and escaping is treated literally in raw mode. + let args = parse_signature(r#"'\'"#, signature).unwrap(); + assert_eq!(args.len(), 1); + assert_eq!(&args[0], "'\\'"); + let args = parse_signature(r#"\''"#, signature).unwrap(); + assert_eq!(args.len(), 1); + assert_eq!(&args[0], "\\''"); + + // Leading space is trimmed. + let args = parse_signature(r#" %sh{foo}"#, signature).unwrap(); + assert_eq!(args.len(), 1); + assert_eq!(&args[0], "%sh{foo}"); + + let signature = Signature { + positionals: (1, Some(2)), + raw_after: Some(1), + ..Signature::DEFAULT + }; + + let args = parse_signature("foo", signature).unwrap(); + assert_eq!(args.len(), 1); + assert_eq!(&args[0], "foo"); + + // "--bar" is treated as a positional. + let args = parse_signature("foo --bar", signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "foo"); + assert_eq!(&args[1], "--bar"); + + let args = parse_signature("abc def ghi", signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "abc"); + assert_eq!(&args[1], "def ghi"); + + let args = parse_signature("rulers [20, 30]", signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "rulers"); + assert_eq!(&args[1], "[20, 30]"); + + let args = + parse_signature(r#"gutters ["diff"] ["diff", "diagnostics"]"#, signature).unwrap(); + assert_eq!(args.len(), 2); + assert_eq!(&args[0], "gutters"); + assert_eq!(&args[1], r#"["diff"] ["diff", "diagnostics"]"#); + } +} |