#![allow(clippy::upper_case_acronyms)] use itertools::Itertools; use std::{iter::{Iterator, Peekable}, convert::TryFrom, rc::Rc, fmt}; use std::convert::TryInto; /// A location in a particular source file. Note that the /// sizes of the internal unsigned integer types limit /// the size of a source file to 2^32 lines of /// at most 2^16 characters, which should be plenty big. #[derive(Debug, Clone, Copy, PartialEq, Default)] pub struct Location { pub(crate) line_num: u32, pub(crate) char_num: u16, } impl fmt::Display for Location { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}:{}", self.line_num, self.char_num) } } #[derive(Debug, PartialEq, Clone)] pub enum TokenKind { Newline, Semicolon, LParen, RParen, LSquareBracket, RSquareBracket, LAngleBracket, RAngleBracket, LCurlyBrace, RCurlyBrace, Pipe, Backslash, Comma, Period, Colon, Underscore, Slash, Equals, Operator(Rc), DigitGroup(Rc), HexLiteral(Rc), BinNumberSigil, StrLiteral { s: Rc, prefix: Option> }, Identifier(Rc), Keyword(Kw), EOF, Error(String), } use self::TokenKind::*; impl fmt::Display for TokenKind { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { &Operator(ref s) => write!(f, "Operator({})", **s), &DigitGroup(ref s) => write!(f, "DigitGroup({})", s), &HexLiteral(ref s) => write!(f, "HexLiteral({})", s), &StrLiteral {ref s, .. } => write!(f, "StrLiteral({})", s), &Identifier(ref s) => write!(f, "Identifier({})", s), &Error(ref s) => write!(f, "Error({})", s), other => write!(f, "{:?}", other), } } } #[derive(Debug, Clone, Copy, PartialEq)] pub enum Kw { If, Then, Else, Is, Func, For, While, Const, Let, In, Mut, Return, Alias, Type, SelfType, SelfIdent, Interface, Impl, True, False, Module, Import } impl TryFrom<&str> for Kw { type Error = (); fn try_from(value: &str) -> Result { Ok(match value { "if" => Kw::If, "then" => Kw::Then, "else" => Kw::Else, "is" => Kw::Is, "fn" => Kw::Func, "for" => Kw::For, "while" => Kw::While, "const" => Kw::Const, "let" => Kw::Let, "in" => Kw::In, "mut" => Kw::Mut, "return" => Kw::Return, "alias" => Kw::Alias, "type" => Kw::Type, "Self" => Kw::SelfType, "self" => Kw::SelfIdent, "interface" => Kw::Interface, "impl" => Kw::Impl, "true" => Kw::True, "false" => Kw::False, "module" => Kw::Module, "import" => Kw::Import, _ => return Err(()), }) } } #[derive(Debug, Clone, PartialEq)] pub struct Token { pub kind: TokenKind, pub(crate) location: Location, } impl Token { pub fn to_string_with_metadata(&self) -> String { format!("{}({})", self.kind, self.location) } pub fn get_kind(&self) -> TokenKind { self.kind.clone() } } const OPERATOR_CHARS: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`']; fn is_operator(c: &char) -> bool { OPERATOR_CHARS.iter().any(|x| x == c) } type CharData = (usize, usize, char); pub fn tokenize(input: &str) -> Vec { let mut tokens: Vec = Vec::new(); let mut input = Iterator::intersperse(input.lines().enumerate(), (0, "\n")) .flat_map(|(line_idx, line)| { line.chars().enumerate().map(move |(ch_idx, ch)| (line_idx, ch_idx, ch)) }) .peekable(); while let Some((line_num, char_num, c)) = input.next() { let cur_tok_kind = match c { '/' => match input.peek().map(|t| t.2) { Some('/') => { for (_, _, c) in input.by_ref() { if c == '\n' { break; } } continue; }, Some('*') => { input.next(); let mut comment_level = 1; while let Some((_, _, c)) = input.next() { if c == '*' && input.peek().map(|t| t.2) == Some('/') { input.next(); comment_level -= 1; } else if c == '/' && input.peek().map(|t| t.2) == Some('*') { input.next(); comment_level += 1; } if comment_level == 0 { break; } } if comment_level != 0 { Error("Unclosed comment".to_string()) } else { continue; } }, _ => Slash }, c if c.is_whitespace() && c != '\n' => continue, '\n' => Newline, ';' => Semicolon, ':' => Colon, ',' => Comma, '(' => LParen, ')' => RParen, '{' => LCurlyBrace, '}' => RCurlyBrace, '[' => LSquareBracket, ']' => RSquareBracket, '"' => handle_quote(&mut input, None), '\\' => Backslash, c if c.is_digit(10) => handle_digit(c, &mut input), c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input), c if is_operator(&c) => handle_operator(c, &mut input), unknown => Error(format!("Unexpected character: {}", unknown)), }; let location = Location { line_num: line_num.try_into().unwrap(), char_num: char_num.try_into().unwrap() }; tokens.push(Token { kind: cur_tok_kind, location }); } tokens } fn handle_digit(c: char, input: &mut Peekable>) -> TokenKind { let next_ch = input.peek().map(|&(_, _, c)| c); if c == '0' && next_ch == Some('x') { input.next(); let rest: String = input.peeking_take_while(|&(_, _, ref c)| c.is_digit(16) || *c == '_').map(|(_, _, c)| { c }).collect(); HexLiteral(Rc::new(rest)) } else if c == '0' && next_ch == Some('b') { input.next(); BinNumberSigil } else { let mut buf = c.to_string(); buf.extend(input.peeking_take_while(|&(_, _, ref c)| c.is_digit(10)).map(|(_, _, c)| { c })); DigitGroup(Rc::new(buf)) } } fn handle_quote(input: &mut Peekable>, quote_prefix: Option<&str>) -> TokenKind { let mut buf = String::new(); loop { match input.next().map(|(_, _, c)| { c }) { Some('"') => break, Some('\\') => { let next = input.peek().map(|&(_, _, c)| { c }); if next == Some('n') { input.next(); buf.push('\n') } else if next == Some('"') { input.next(); buf.push('"'); } else if next == Some('t') { input.next(); buf.push('\t'); } }, Some(c) => buf.push(c), None => return TokenKind::Error("Unclosed string".to_string()), } } TokenKind::StrLiteral { s: Rc::new(buf), prefix: quote_prefix.map(|s| Rc::new(s.to_string())) } } fn handle_alphabetic(c: char, input: &mut Peekable>) -> TokenKind { let mut buf = String::new(); buf.push(c); let next_is_alphabetic = input.peek().map(|&(_, _, c)| !c.is_alphabetic()).unwrap_or(true); if c == '_' && next_is_alphabetic { return TokenKind::Underscore } loop { match input.peek().map(|&(_, _, c)| { c }) { Some(c) if c == '"' => { input.next(); return handle_quote(input, Some(&buf)); }, Some(c) if c.is_alphanumeric() || c == '_' => { input.next(); buf.push(c); }, _ => break, } } match Kw::try_from(buf.as_str()) { Ok(kw) => TokenKind::Keyword(kw), Err(()) => TokenKind::Identifier(Rc::new(buf)), } } fn handle_operator(c: char, input: &mut Peekable>) -> TokenKind { match c { '<' | '>' | '|' | '.' | '=' => { let next = &input.peek().map(|&(_, _, c)| { c }); let next_is_op = next.map(|n| { is_operator(&n) }).unwrap_or(false); if !next_is_op { return match c { '<' => LAngleBracket, '>' => RAngleBracket, '|' => Pipe, '.' => Period, '=' => Equals, _ => unreachable!(), } } }, _ => (), }; let mut buf = String::new(); if c == '`' { loop { match input.peek().map(|&(_, _, c)| { c }) { Some(c) if c.is_alphabetic() || c == '_' => { input.next(); buf.push(c); }, Some('`') => { input.next(); break; }, _ => break } } } else { buf.push(c); loop { match input.peek().map(|&(_, _, c)| { c }) { Some(c) if is_operator(&c) => { input.next(); buf.push(c); }, _ => break } } } TokenKind::Operator(Rc::new(buf)) } #[cfg(test)] mod schala_tokenizer_tests { use super::*; use super::Kw::*; macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } } macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } } macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } } fn token_kinds(input: &str) -> Vec { tokenize(input).into_iter().map(move |tok| tok.kind).collect() } #[test] fn tokens() { let output = token_kinds("let a: A = c ++ d"); assert_eq!(output, vec![Keyword(Let), ident!("a"), Colon, ident!("A"), LAngleBracket, ident!("B"), RAngleBracket, Equals, ident!("c"), op!("++"), ident!("d")]); } #[test] fn underscores() { let output = token_kinds("4_8"); assert_eq!(output, vec![digit!("4"), Underscore, digit!("8")]); let output = token_kinds("aba_yo"); assert_eq!(output, vec![ident!("aba_yo")]); } #[test] fn comments() { let output = token_kinds("1 + /* hella /* bro */ */ 2"); assert_eq!(output, vec![digit!("1"), op!("+"), digit!("2")]); let output = token_kinds("1 + /* hella /* bro */ 2"); assert_eq!(output, vec![digit!("1"), op!("+"), Error("Unclosed comment".to_string())]); //TODO not sure if I want this behavior let output = token_kinds("1 + /* hella */ bro */ 2"); assert_eq!(output, vec![digit!("1"), op!("+"), Identifier(Rc::new("bro".to_string())), Operator(Rc::new("*".to_string())), Slash, DigitGroup(Rc::new("2".to_string()))]); } #[test] fn backtick_operators() { let output = token_kinds("1 `plus` 2"); assert_eq!(output, vec![digit!("1"), op!("plus"), digit!("2")]); } #[test] fn string_literals() { let output = token_kinds(r#""some string""#); assert_eq!(output, vec![StrLiteral { s: Rc::new("some string".to_string()), prefix: None }]); let output = token_kinds(r#"b"some bytestring""#); assert_eq!(output, vec![StrLiteral { s: Rc::new("some bytestring".to_string()), prefix: Some(Rc::new("b".to_string())) }]); let output = token_kinds(r#""Do \n \" escapes work\t""#); assert_eq!(output, vec![StrLiteral { s: Rc::new("Do \n \" escapes work\t".to_string()), prefix: None }]); } }