Unnamed repository; edit this file 'description' to name the repository.
Diffstat (limited to 'lib/ungrammar/src/lexer.rs')
-rw-r--r--lib/ungrammar/src/lexer.rs129
1 files changed, 129 insertions, 0 deletions
diff --git a/lib/ungrammar/src/lexer.rs b/lib/ungrammar/src/lexer.rs
new file mode 100644
index 0000000000..23da09abb2
--- /dev/null
+++ b/lib/ungrammar/src/lexer.rs
@@ -0,0 +1,129 @@
+//! Simple hand-written ungrammar lexer
+use crate::error::{Result, bail};
+
+#[derive(Debug, Eq, PartialEq)]
+pub(crate) enum TokenKind {
+ Node(String),
+ Token(String),
+ Eq,
+ Star,
+ Pipe,
+ QMark,
+ Colon,
+ LParen,
+ RParen,
+}
+
+#[derive(Debug)]
+pub(crate) struct Token {
+ pub(crate) kind: TokenKind,
+ pub(crate) loc: Location,
+}
+
+#[derive(Copy, Clone, Default, Debug)]
+pub(crate) struct Location {
+ pub(crate) line: usize,
+ pub(crate) column: usize,
+}
+
+impl Location {
+ fn advance(&mut self, text: &str) {
+ match text.rfind('\n') {
+ Some(idx) => {
+ self.line += text.chars().filter(|&it| it == '\n').count();
+ self.column = text[idx + 1..].chars().count();
+ }
+ None => self.column += text.chars().count(),
+ }
+ }
+}
+
+pub(crate) fn tokenize(mut input: &str) -> Result<Vec<Token>> {
+ let mut res = Vec::new();
+ let mut loc = Location::default();
+ while !input.is_empty() {
+ let old_input = input;
+ skip_ws(&mut input);
+ skip_comment(&mut input);
+ if old_input.len() == input.len() {
+ match advance(&mut input) {
+ Ok(kind) => {
+ res.push(Token { kind, loc });
+ }
+ Err(err) => return Err(err.with_location(loc)),
+ }
+ }
+ let consumed = old_input.len() - input.len();
+ loc.advance(&old_input[..consumed]);
+ }
+
+ Ok(res)
+}
+
+fn skip_ws(input: &mut &str) {
+ *input = input.trim_start_matches(is_whitespace)
+}
+fn skip_comment(input: &mut &str) {
+ if input.starts_with("//") {
+ let idx = input.find('\n').map_or(input.len(), |it| it + 1);
+ *input = &input[idx..]
+ }
+}
+
+fn advance(input: &mut &str) -> Result<TokenKind> {
+ let mut chars = input.chars();
+ let c = chars.next().unwrap();
+ let res = match c {
+ '=' => TokenKind::Eq,
+ '*' => TokenKind::Star,
+ '?' => TokenKind::QMark,
+ '(' => TokenKind::LParen,
+ ')' => TokenKind::RParen,
+ '|' => TokenKind::Pipe,
+ ':' => TokenKind::Colon,
+ '\'' => {
+ let mut buf = String::new();
+ loop {
+ match chars.next() {
+ None => bail!("unclosed token literal"),
+ Some('\\') => match chars.next() {
+ Some(c) if is_escapable(c) => buf.push(c),
+ _ => bail!("invalid escape in token literal"),
+ },
+ Some('\'') => break,
+ Some(c) => buf.push(c),
+ }
+ }
+ TokenKind::Token(buf)
+ }
+ c if is_ident_char(c) => {
+ let mut buf = String::new();
+ buf.push(c);
+ loop {
+ match chars.clone().next() {
+ Some(c) if is_ident_char(c) => {
+ chars.next();
+ buf.push(c);
+ }
+ _ => break,
+ }
+ }
+ TokenKind::Node(buf)
+ }
+ '\r' => bail!("unexpected `\\r`, only Unix-style line endings allowed"),
+ c => bail!("unexpected character: `{}`", c),
+ };
+
+ *input = chars.as_str();
+ Ok(res)
+}
+
+fn is_escapable(c: char) -> bool {
+ matches!(c, '\\' | '\'')
+}
+fn is_whitespace(c: char) -> bool {
+ matches!(c, ' ' | '\t' | '\n')
+}
+fn is_ident_char(c: char) -> bool {
+ matches!(c, 'a'..='z' | 'A'..='Z' | '_')
+}