smol lang
Diffstat (limited to 'src/lexer.rs')
-rw-r--r-- src/lexer.rs 271
1 files changed, 271 insertions, 0 deletions
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..a3f8c60
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,271 @@
+use beef::lean::Cow;
+use chumsky::span::SimpleSpan;
+use logos::{Lexer as RealLexer, Logos, SpannedIter};
+
+// Generates the `Token` enum and its `Display` impl from a table of fixed-text tokens.
+//
+// Each table row has the shape `"text" => Variant,` or `"text" | "alt" => Variant,`.
+// Every row becomes a payload-free variant matched by `#[token]` at priority 8; the
+// optional second literal is an alternative spelling of the same variant. `Display`
+// prints the first literal of the row.
+macro_rules! tokens {
+    ($($z:literal $( | $y:literal)? => $v:ident,)+) => {
+        /// A single lexeme of the language. Whitespace between lexemes is skipped.
+        #[derive(Logos, Debug, PartialEq, Clone)]
+        #[logos(skip r"[\n\s]+")]
+        pub enum Token<'strings> {
+            /// Line comment running from `//` to the end of the line.
+            // `*` rather than `+`: a bare `//` with nothing after it must still be a
+            // comment instead of falling through to `FnIdent`.
+            #[regex("//[^\n]*", priority = 8)]
+            // #[regex(r"/\*[\s\S]+\*/", priority = 8)]
+            Comment(&'strings str),
+            /// Unsigned integer literal: decimal, `0x`/`0X` hexadecimal or `0b`/`0B` binary.
+            // The callbacks return `None` when the digits do not fit in a `u64`.
+            #[regex(r"[0-9]+", |lex| lex.slice().parse().ok())]
+            #[regex(r"0[xX][0-9a-fA-F]+", |lex| u64::from_str_radix(&lex.slice()[2..], 16).ok())]
+            #[regex(r"0[bB][01]+", |lex| u64::from_str_radix(&lex.slice()[2..], 2).ok())]
+            Int(u64),
+            /// Floating point literal with digits on both sides of the `.`.
+            #[regex(r"[0-9]+\.[0-9]+", |lex| lex.slice().parse().ok())]
+            Float(f64),
+            /// String literal with the surrounding quotes removed.
+            // First pattern: no backslash, quote or newline inside, so the text between
+            // the quotes is borrowed unchanged.
+            // Second pattern: anything up to the next `"`; every two-character `\n`
+            // sequence is replaced by a real newline, which needs an owned copy.
+            #[regex(r#""([^\\"\n])*""#, callback = |lex| Cow::from(&lex.slice()[1..lex.slice().len()-1]), priority = 12)]
+            #[regex(r#""[^"]*""#, callback = |lex| Cow::from(lex.slice()[1..lex.slice().len()-1].replace(r"\n", "\n")), priority = 8)]
+            String(Cow<'strings, str>),
+            /// Ordinary identifier: starts with a Latin letter, a lowercase Greek letter or `_`.
+            // NOTE(review): the `lexer` test in this file expects a lone `T` to lex as
+            // `FnIdent`, yet `A-Z` is accepted as a first character here and this rule
+            // has the higher priority. Confirm which of the two is intended.
+            #[regex(r"[a-z_α-ωA-Z]['̇A-Za-z0-9_α-ω]*", priority = 7)]
+            Ident(&'strings str),
+            /// Operator-like identifier: any other run of characters that does not start
+            /// with a bracket, `:`, `λ`, a digit, `,`, `?` or whitespace.
+            #[regex(r"[^\{\[\(\)\]\}:λ0-9,\?\s][^'̇\?\{\[\(\)\]\},:\s]*", priority = 6)]
+            FnIdent(&'strings str),
+
+            /// One of `{`, `[` or `(`; the payload records which.
+            #[token("{", chr::<'{'>)]
+            #[token("[", chr::<'['>)]
+            #[token("(", chr::<'('>)]
+            OpeningBracket(char),
+            /// One of `}`, `]` or `)`; the payload records which.
+            #[token("}", chr::<'}'>)]
+            #[token("]", chr::<']'>)]
+            #[token(")", chr::<')'>)]
+            ClosingBracket(char),
+
+            // One payload-free variant per row of the table passed to the macro.
+            $(#[token($z, priority = 8)] $(#[token($y, priority = 8)])? $v,)+
+        }
+
+        impl std::fmt::Display for Token<'_> {
+            /// Writes the token as text: table tokens as their first spelling, everything
+            /// else as its payload (strings are written without quotes).
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                match self {
+                    // `write_str` instead of `write!(f, $z)`: the token text is data, not
+                    // a format string, so a token containing `{` or `}` stays valid.
+                    $(Self::$v => f.write_str($z),)+
+                    Self::FnIdent(s) | Self::Ident(s) | Self::Comment(s) => f.write_str(s),
+                    Self::String(s) => write!(f, "{s}"),
+                    Self::Float(n) => write!(f, "{n}"),
+                    Self::Int(n) => write!(f, "{n}"),
+                    Self::OpeningBracket(x) | Self::ClosingBracket(x) => write!(f, "{x}"),
+                }
+            }
+        }
+    }
+}
+
+// Fixed-text tokens of the language: keywords first, then punctuation, then the
+// words used inside operator declarations.
+// Where a row lists two spellings, both lex to the same variant and `Display`
+// prints the first one.
+tokens! {
+ "mut" => Mut,
+ "let" => Let,
+ "static" => Static,
+ "impl" => Impl,
+ "mod" => Mod,
+ "match" => Match,
+ "for" => For,
+ "break" => Break,
+ "enum" => Enum,
+ "union" => Union,
+ "pub" => Public,
+ "typeck" => TypeCheck,
+ "struct" => Struct,
+ "if" => If,
+ "else" => Else,
+ // Punctuation; the second spelling is a single-character Unicode alternative.
+ "=>" | "⇒" => FatArrow,
+ "->" | "→" => ThinArrow,
+ "," => Comma,
+ ":" => Colon,
+ ";" => Semicolon,
+ "::" | "∷" => Access,
+ "=" => Equal,
+ // NOTE(review): `Lamba` looks like a misspelling of `Lambda`; renaming it would
+ // touch every user of the variant, so it is left as is here.
+ "λ" => Lamba,
+ "()" => Unit,
+ // Words that appear in the bracketed operator annotations (see the `lexer` test).
+ "prefix" => Prefix,
+ "infix" => Infix,
+ "postfix" => Postfix,
+ "alias" => Alias,
+ "associativity" => Associativity,
+ "looser_than" => LooserThan,
+ "tighter_than" => TighterThan,
+ "like" => Like,
+}
+
+/// Creates a [`Lexer`] over the source text `s`.
+///
+/// The returned iterator borrows from `s`.
+pub fn lex(s: &str) -> Lexer<'_> {
+    let inner = Token::lexer(s).spanned();
+    Lexer { inner }
+}
+
+/// Token callback that ignores the lexer and always succeeds with the character `CHR`.
+///
+/// The bracket rules of [`Token`] use it to record which bracket was matched.
+fn chr<'src, const CHR: char>(_lexer: &mut RealLexer<'src, Token<'src>>) -> Result<char, ()> {
+    let bracket = CHR;
+    Ok(bracket)
+}
+/// Iterator over the tokens of a source string, each paired with its span.
+/// Construct one with [`lex`].
+pub struct Lexer<'s> {
+ /// Underlying `logos` iterator that yields each lexing result with its span.
+ inner: SpannedIter<'s, Token<'s>>,
+}
+
+impl<'s> Iterator for Lexer<'s> {
+    type Item = (Token<'s>, SimpleSpan<usize>);
+
+    /// Yields the next token together with its span.
+    ///
+    /// Comments and anything the underlying lexer reports as an error are skipped.
+    /// Returns `None` once the input is exhausted.
+    // NOTE(review): lexing errors are dropped silently here; confirm that is intended.
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            let (result, range) = self.inner.next()?;
+            match result {
+                Err(_) | Ok(Token::Comment(_)) => continue,
+                Ok(token) => return Some((token, SimpleSpan::new(range.start, range.end))),
+            }
+        }
+    }
+}
+
+// Lexes a small sample program and checks the exact sequence of tokens produced.
+// Spans are ignored; only the tokens themselves are compared.
+#[test]
+fn lexer() {
+ let mut lex = lex(r#"
+let sint: typeset = { i8, i16 }
+- (x: T -> T) [alias neg, prefix { like ¬ }]
+mod intrinsics {
+ or ((a: T, b: T) -> T) { compiler_defined } [prefix, infix, T ∈ int]
+}
+enum bool { true, false }
+∧ ((a: bool, b: bool) -> bool) { a & b } [infix { associativity <, looser_than « }]
+impl bool {
+ ∧ ((a, b: λ(() -> me)) -> me) {
+ match a {
+ true => b (),
+ false => false,
+ }
+ }
+}
+ "#);
+ // while let Some(x) = lex.next() { print!("{x} "); }
+ // `test!` takes a list of expected tokens, written as `Variant` or `Variant(literal)`,
+ // asserts that the lexer yields exactly those in order, and then that it is exhausted.
+ macro_rules! test {
+ ($($tok:ident$(($var:literal))?)+) => {{
+ $(assert_eq!(lex.next().map(|(x,_)|x), Some(Token::$tok$(($var.into()))?));)+
+ assert_eq!(lex.next(), None);
+ }}
+ }
+ test! [
+ // let sint: typeset = { i8, i16 }
+ Let
+ Ident("sint")
+ Colon
+ Ident("typeset")
+ Equal
+ OpeningBracket('{')
+ Ident("i8")
+ Comma
+ Ident("i16")
+ ClosingBracket('}')
+ // - (x: T -> T) [alias neg, prefix { like ¬ }]
+ FnIdent("-")
+ OpeningBracket('(')
+ Ident("x")
+ Colon
+ FnIdent("T")
+ ThinArrow
+ FnIdent("T")
+ ClosingBracket(')')
+ OpeningBracket('[')
+ Alias
+ Ident("neg")
+ Comma
+ Prefix
+ OpeningBracket('{')
+ Like
+ FnIdent("¬")
+ ClosingBracket('}')
+ ClosingBracket(']')
+ // mod intrinsics {
+ Mod
+ Ident("intrinsics")
+ OpeningBracket('{')
+ // or ((a: T, b: T) -> T) { compiler_defined } [prefix, infix, T ∈ int]
+ Ident("or")
+ OpeningBracket('(')
+ OpeningBracket('(')
+ Ident("a")
+ Colon
+ FnIdent("T")
+ Comma
+ Ident("b")
+ Colon
+ FnIdent("T")
+ ClosingBracket(')')
+ ThinArrow
+ FnIdent("T")
+ ClosingBracket(')')
+ OpeningBracket('{')
+ Ident("compiler_defined")
+ ClosingBracket('}')
+ OpeningBracket('[')
+ Prefix
+ Comma
+ Infix
+ Comma
+ FnIdent("T")
+ FnIdent("∈")
+ Ident("int")
+ ClosingBracket(']')
+ // } (closes `mod intrinsics`)
+ ClosingBracket('}')
+ // enum bool { true, false }
+ Enum
+ Ident("bool")
+ OpeningBracket('{')
+ Ident("true")
+ Comma
+ Ident("false")
+ ClosingBracket('}')
+ // ∧ ((a: bool, b: bool) -> bool) { a & b } [infix { associativity <, looser_than « }]
+ FnIdent("∧")
+ OpeningBracket('(')
+ OpeningBracket('(')
+ Ident("a")
+ Colon
+ Ident("bool")
+ Comma
+ Ident("b")
+ Colon
+ Ident("bool")
+ ClosingBracket(')')
+ ThinArrow
+ Ident("bool")
+ ClosingBracket(')')
+ OpeningBracket('{')
+ Ident("a")
+ FnIdent("&")
+ Ident("b")
+ ClosingBracket('}')
+ OpeningBracket('[')
+ Infix
+ OpeningBracket('{')
+ Associativity
+ FnIdent("<")
+ Comma
+ LooserThan
+ FnIdent("«")
+ ClosingBracket('}')
+ ClosingBracket(']')
+ // impl bool {
+ Impl
+ Ident("bool")
+ OpeningBracket('{')
+ // ∧ ((a, b: λ(() -> me)) -> me) {
+ FnIdent("∧")
+ OpeningBracket('(')
+ OpeningBracket('(')
+ Ident("a")
+ Comma
+ Ident("b")
+ Colon
+ Lamba
+ OpeningBracket('(')
+ Unit
+ ThinArrow
+ Ident("me")
+ ClosingBracket(')')
+ ClosingBracket(')')
+ ThinArrow
+ Ident("me")
+ ClosingBracket(')')
+ OpeningBracket('{')
+ // match a {
+ Match
+ Ident("a")
+ OpeningBracket('{')
+ // true => b (),
+ Ident("true")
+ FatArrow
+ Ident("b")
+ Unit
+ Comma
+ // false => false,
+ Ident("false")
+ FatArrow
+ Ident("false")
+ Comma
+ // Closing braces of `match`, the method body and `impl bool`.
+ ClosingBracket('}')
+ ClosingBracket('}')
+ ClosingBracket('}')
+ ]
+}