diff --git a/schala-lang/language/src/tokenizing.rs b/schala-lang/language/src/tokenizing.rs index 4f67dbf..4ed0dc0 100644 --- a/schala-lang/language/src/tokenizing.rs +++ b/schala-lang/language/src/tokenizing.rs @@ -1,8 +1,13 @@ #![allow(clippy::upper_case_acronyms)] +use std::{ + convert::{TryFrom, TryInto}, + fmt, + iter::{Iterator, Peekable}, + rc::Rc, +}; + use itertools::Itertools; -use std::{iter::{Iterator, Peekable}, convert::TryFrom, rc::Rc, fmt}; -use std::convert::TryInto; /// A location in a particular source file. Note that the /// sizes of the internal unsigned integer types limit @@ -10,370 +15,444 @@ use std::convert::TryInto; /// at most 2^16 characters, which should be plenty big. #[derive(Debug, Clone, Copy, PartialEq, Default)] pub struct Location { - pub(crate) line_num: u32, - pub(crate) char_num: u16, + pub(crate) line_num: u32, + pub(crate) char_num: u16, } impl fmt::Display for Location { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}:{}", self.line_num, self.char_num) - } + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}:{}", self.line_num, self.char_num) + } } #[derive(Debug, PartialEq, Clone)] pub enum TokenKind { - Newline, Semicolon, + Newline, + Semicolon, - LParen, RParen, - LSquareBracket, RSquareBracket, - LAngleBracket, RAngleBracket, - LCurlyBrace, RCurlyBrace, - Pipe, Backslash, - AtSign, + LParen, + RParen, + LSquareBracket, + RSquareBracket, + LAngleBracket, + RAngleBracket, + LCurlyBrace, + RCurlyBrace, + Pipe, + Backslash, + AtSign, + Comma, + Period, + Colon, + Underscore, + Slash, + Equals, - Comma, Period, Colon, Underscore, - Slash, Equals, + Operator(Rc), + DigitGroup(Rc), + HexLiteral(Rc), + BinNumberSigil, + StrLiteral { s: Rc, prefix: Option> }, + Identifier(Rc), + Keyword(Kw), - Operator(Rc), - DigitGroup(Rc), HexLiteral(Rc), BinNumberSigil, - StrLiteral { - s: Rc, - prefix: Option> - }, - Identifier(Rc), - Keyword(Kw), + EOF, - EOF, - - Error(String), + Error(String), } use self::TokenKind::*; impl fmt::Display for TokenKind { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - &Operator(ref s) => write!(f, "Operator({})", **s), - &DigitGroup(ref s) => write!(f, "DigitGroup({})", s), - &HexLiteral(ref s) => write!(f, "HexLiteral({})", s), - &StrLiteral {ref s, .. } => write!(f, "StrLiteral({})", s), - &Identifier(ref s) => write!(f, "Identifier({})", s), - &Error(ref s) => write!(f, "Error({})", s), - other => write!(f, "{:?}", other), + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + &Operator(ref s) => write!(f, "Operator({})", **s), + &DigitGroup(ref s) => write!(f, "DigitGroup({})", s), + &HexLiteral(ref s) => write!(f, "HexLiteral({})", s), + &StrLiteral { ref s, .. } => write!(f, "StrLiteral({})", s), + &Identifier(ref s) => write!(f, "Identifier({})", s), + &Error(ref s) => write!(f, "Error({})", s), + other => write!(f, "{:?}", other), + } } - } } #[derive(Debug, Clone, Copy, PartialEq)] pub enum Kw { - If, Then, Else, - Is, - Func, - For, While, - Const, Let, In, - Mut, - Return, - Alias, Type, SelfType, SelfIdent, - Interface, Impl, - True, False, - Module, Import + If, + Then, + Else, + Is, + Func, + For, + While, + Const, + Let, + In, + Mut, + Return, + Alias, + Type, + SelfType, + SelfIdent, + Interface, + Impl, + True, + False, + Module, + Import, } impl TryFrom<&str> for Kw { - type Error = (); + type Error = (); - fn try_from(value: &str) -> Result { - Ok(match value { - "if" => Kw::If, - "then" => Kw::Then, - "else" => Kw::Else, - "is" => Kw::Is, - "fn" => Kw::Func, - "for" => Kw::For, - "while" => Kw::While, - "const" => Kw::Const, - "let" => Kw::Let, - "in" => Kw::In, - "mut" => Kw::Mut, - "return" => Kw::Return, - "alias" => Kw::Alias, - "type" => Kw::Type, - "Self" => Kw::SelfType, - "self" => Kw::SelfIdent, - "interface" => Kw::Interface, - "impl" => Kw::Impl, - "true" => Kw::True, - "false" => Kw::False, - "module" => Kw::Module, - "import" => Kw::Import, - _ => return Err(()), - }) - } + fn try_from(value: &str) -> Result { + Ok(match value { + "if" => Kw::If, + "then" => Kw::Then, + "else" => Kw::Else, + "is" => Kw::Is, + "fn" => Kw::Func, + "for" => Kw::For, + "while" => Kw::While, + "const" => Kw::Const, + "let" => Kw::Let, + "in" => Kw::In, + "mut" => Kw::Mut, + "return" => Kw::Return, + "alias" => Kw::Alias, + "type" => Kw::Type, + "Self" => Kw::SelfType, + "self" => Kw::SelfIdent, + "interface" => Kw::Interface, + "impl" => Kw::Impl, + "true" => Kw::True, + "false" => Kw::False, + "module" => Kw::Module, + "import" => Kw::Import, + _ => return Err(()), + }) + } } #[derive(Debug, Clone, PartialEq)] pub struct Token { - pub kind: TokenKind, - pub(crate) location: Location, + pub kind: TokenKind, + pub(crate) location: Location, } impl Token { - pub fn to_string_with_metadata(&self) -> String { - format!("{}({})", self.kind, self.location) - } + pub fn to_string_with_metadata(&self) -> String { + format!("{}({})", self.kind, self.location) + } - pub fn get_kind(&self) -> TokenKind { - self.kind.clone() - } + pub fn get_kind(&self) -> TokenKind { + self.kind.clone() + } } -const OPERATOR_CHARS: [char; 17] = ['!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '^', '|', '~', '`']; +const OPERATOR_CHARS: [char; 17] = + ['!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '^', '|', '~', '`']; fn is_operator(c: &char) -> bool { - OPERATOR_CHARS.iter().any(|x| x == c) + OPERATOR_CHARS.iter().any(|x| x == c) } type CharData = (usize, usize, char); pub fn tokenize(input: &str) -> Vec { - let mut tokens: Vec = Vec::new(); + let mut tokens: Vec = Vec::new(); - let mut input = Iterator::intersperse(input.lines().enumerate(), (0, "\n")) - .flat_map(|(line_idx, line)| { - line.chars().enumerate().map(move |(ch_idx, ch)| (line_idx, ch_idx, ch)) - }) - .peekable(); + let mut input = Iterator::intersperse(input.lines().enumerate(), (0, "\n")) + .flat_map(|(line_idx, line)| line.chars().enumerate().map(move |(ch_idx, ch)| (line_idx, ch_idx, ch))) + .peekable(); - while let Some((line_num, char_num, c)) = input.next() { - let cur_tok_kind = match c { - '/' => match input.peek().map(|t| t.2) { - Some('/') => { - for (_, _, c) in input.by_ref() { - if c == '\n' { - break; + while let Some((line_num, char_num, c)) = input.next() { + let cur_tok_kind = match c { + '/' => match input.peek().map(|t| t.2) { + Some('/') => { + for (_, _, c) in input.by_ref() { + if c == '\n' { + break; + } + } + continue; } - } - continue; - }, - Some('*') => { - input.next(); - let mut comment_level = 1; - while let Some((_, _, c)) = input.next() { - if c == '*' && input.peek().map(|t| t.2) == Some('/') { - input.next(); - comment_level -= 1; - } else if c == '/' && input.peek().map(|t| t.2) == Some('*') { - input.next(); - comment_level += 1; - } - if comment_level == 0 { - break; - } - } - if comment_level != 0 { - Error("Unclosed comment".to_string()) - } else { - continue; - } - }, - _ => Slash - }, - c if c.is_whitespace() && c != '\n' => continue, - '\n' => Newline, ';' => Semicolon, - ':' => Colon, ',' => Comma, - '(' => LParen, ')' => RParen, - '{' => LCurlyBrace, '}' => RCurlyBrace, - '[' => LSquareBracket, ']' => RSquareBracket, - '"' => handle_quote(&mut input, None), - '\\' => Backslash, - '@' => AtSign, - c if c.is_digit(10) => handle_digit(c, &mut input), - c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input), - c if is_operator(&c) => handle_operator(c, &mut input), - unknown => Error(format!("Unexpected character: {}", unknown)), - }; - let location = Location { line_num: line_num.try_into().unwrap(), char_num: char_num.try_into().unwrap() }; - tokens.push(Token { kind: cur_tok_kind, location }); - } - tokens + Some('*') => { + input.next(); + let mut comment_level = 1; + while let Some((_, _, c)) = input.next() { + if c == '*' && input.peek().map(|t| t.2) == Some('/') { + input.next(); + comment_level -= 1; + } else if c == '/' && input.peek().map(|t| t.2) == Some('*') { + input.next(); + comment_level += 1; + } + if comment_level == 0 { + break; + } + } + if comment_level != 0 { + Error("Unclosed comment".to_string()) + } else { + continue; + } + } + _ => Slash, + }, + c if c.is_whitespace() && c != '\n' => continue, + '\n' => Newline, + ';' => Semicolon, + ':' => Colon, + ',' => Comma, + '(' => LParen, + ')' => RParen, + '{' => LCurlyBrace, + '}' => RCurlyBrace, + '[' => LSquareBracket, + ']' => RSquareBracket, + '"' => handle_quote(&mut input, None), + '\\' => Backslash, + '@' => AtSign, + c if c.is_digit(10) => handle_digit(c, &mut input), + c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input), + c if is_operator(&c) => handle_operator(c, &mut input), + unknown => Error(format!("Unexpected character: {}", unknown)), + }; + let location = + Location { line_num: line_num.try_into().unwrap(), char_num: char_num.try_into().unwrap() }; + tokens.push(Token { kind: cur_tok_kind, location }); + } + tokens } -fn handle_digit(c: char, input: &mut Peekable>) -> TokenKind { +fn handle_digit(c: char, input: &mut Peekable>) -> TokenKind { let next_ch = input.peek().map(|&(_, _, c)| c); if c == '0' && next_ch == Some('x') { input.next(); - let rest: String = input.peeking_take_while(|&(_, _, ref c)| c.is_digit(16) || *c == '_').map(|(_, _, c)| { c }).collect(); + let rest: String = input + .peeking_take_while(|&(_, _, ref c)| c.is_digit(16) || *c == '_') + .map(|(_, _, c)| c) + .collect(); HexLiteral(Rc::new(rest)) } else if c == '0' && next_ch == Some('b') { input.next(); BinNumberSigil } else { let mut buf = c.to_string(); - buf.extend(input.peeking_take_while(|&(_, _, ref c)| c.is_digit(10)).map(|(_, _, c)| { c })); + buf.extend(input.peeking_take_while(|&(_, _, ref c)| c.is_digit(10)).map(|(_, _, c)| c)); DigitGroup(Rc::new(buf)) } } -fn handle_quote(input: &mut Peekable>, quote_prefix: Option<&str>) -> TokenKind { - let mut buf = String::new(); - loop { - match input.next().map(|(_, _, c)| { c }) { - Some('"') => break, - Some('\\') => { - let next = input.peek().map(|&(_, _, c)| { c }); - if next == Some('n') { - input.next(); - buf.push('\n') - } else if next == Some('"') { - input.next(); - buf.push('"'); - } else if next == Some('t') { - input.next(); - buf.push('\t'); - } - }, - Some(c) => buf.push(c), - None => return TokenKind::Error("Unclosed string".to_string()), - } - } - TokenKind::StrLiteral { s: Rc::new(buf), prefix: quote_prefix.map(|s| Rc::new(s.to_string())) } -} - -fn handle_alphabetic(c: char, input: &mut Peekable>) -> TokenKind { - let mut buf = String::new(); - buf.push(c); - let next_is_alphabetic = input.peek().map(|&(_, _, c)| !c.is_alphabetic()).unwrap_or(true); - if c == '_' && next_is_alphabetic { - return TokenKind::Underscore - } - - loop { - match input.peek().map(|&(_, _, c)| { c }) { - Some(c) if c == '"' => { - input.next(); - return handle_quote(input, Some(&buf)); - }, - Some(c) if c.is_alphanumeric() || c == '_' => { - input.next(); - buf.push(c); - }, - _ => break, - } - } - - match Kw::try_from(buf.as_str()) { - Ok(kw) => TokenKind::Keyword(kw), - Err(()) => TokenKind::Identifier(Rc::new(buf)), - } -} - -fn handle_operator(c: char, input: &mut Peekable>) -> TokenKind { - match c { - '<' | '>' | '|' | '.' | '=' => { - let next = &input.peek().map(|&(_, _, c)| { c }); - let next_is_op = next.map(|n| { is_operator(&n) }).unwrap_or(false); - if !next_is_op { - return match c { - '<' => LAngleBracket, - '>' => RAngleBracket, - '|' => Pipe, - '.' => Period, - '=' => Equals, - _ => unreachable!(), - } - } - }, - _ => (), - }; - - let mut buf = String::new(); - - if c == '`' { +fn handle_quote( + input: &mut Peekable>, + quote_prefix: Option<&str>, +) -> TokenKind { + let mut buf = String::new(); loop { - match input.peek().map(|&(_, _, c)| { c }) { - Some(c) if c.is_alphabetic() || c == '_' => { - input.next(); - buf.push(c); - }, - Some('`') => { - input.next(); - break; - }, - _ => break - } + match input.next().map(|(_, _, c)| c) { + Some('"') => break, + Some('\\') => { + let next = input.peek().map(|&(_, _, c)| c); + if next == Some('n') { + input.next(); + buf.push('\n') + } else if next == Some('"') { + input.next(); + buf.push('"'); + } else if next == Some('t') { + input.next(); + buf.push('\t'); + } + } + Some(c) => buf.push(c), + None => return TokenKind::Error("Unclosed string".to_string()), + } } - } else { + TokenKind::StrLiteral { s: Rc::new(buf), prefix: quote_prefix.map(|s| Rc::new(s.to_string())) } +} + +fn handle_alphabetic(c: char, input: &mut Peekable>) -> TokenKind { + let mut buf = String::new(); buf.push(c); - loop { - match input.peek().map(|&(_, _, c)| { c }) { - Some(c) if is_operator(&c) => { - input.next(); - buf.push(c); - }, - _ => break - } + let next_is_alphabetic = input.peek().map(|&(_, _, c)| !c.is_alphabetic()).unwrap_or(true); + if c == '_' && next_is_alphabetic { + return TokenKind::Underscore; } - } - TokenKind::Operator(Rc::new(buf)) + + loop { + match input.peek().map(|&(_, _, c)| c) { + Some(c) if c == '"' => { + input.next(); + return handle_quote(input, Some(&buf)); + } + Some(c) if c.is_alphanumeric() || c == '_' => { + input.next(); + buf.push(c); + } + _ => break, + } + } + + match Kw::try_from(buf.as_str()) { + Ok(kw) => TokenKind::Keyword(kw), + Err(()) => TokenKind::Identifier(Rc::new(buf)), + } +} + +fn handle_operator(c: char, input: &mut Peekable>) -> TokenKind { + match c { + '<' | '>' | '|' | '.' | '=' => { + let next = &input.peek().map(|&(_, _, c)| c); + let next_is_op = next.map(|n| is_operator(&n)).unwrap_or(false); + if !next_is_op { + return match c { + '<' => LAngleBracket, + '>' => RAngleBracket, + '|' => Pipe, + '.' => Period, + '=' => Equals, + _ => unreachable!(), + }; + } + } + _ => (), + }; + + let mut buf = String::new(); + + if c == '`' { + loop { + match input.peek().map(|&(_, _, c)| c) { + Some(c) if c.is_alphabetic() || c == '_' => { + input.next(); + buf.push(c); + } + Some('`') => { + input.next(); + break; + } + _ => break, + } + } + } else { + buf.push(c); + loop { + match input.peek().map(|&(_, _, c)| c) { + Some(c) if is_operator(&c) => { + input.next(); + buf.push(c); + } + _ => break, + } + } + } + TokenKind::Operator(Rc::new(buf)) } #[cfg(test)] mod schala_tokenizer_tests { - use super::*; - use super::Kw::*; + use super::{Kw::*, *}; - macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } } - macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } } - macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } } + macro_rules! digit { + ($ident:expr) => { + DigitGroup(Rc::new($ident.to_string())) + }; + } + macro_rules! ident { + ($ident:expr) => { + Identifier(Rc::new($ident.to_string())) + }; + } + macro_rules! op { + ($ident:expr) => { + Operator(Rc::new($ident.to_string())) + }; + } - fn token_kinds(input: &str) -> Vec { - tokenize(input).into_iter().map(move |tok| tok.kind).collect() - } + fn token_kinds(input: &str) -> Vec { + tokenize(input).into_iter().map(move |tok| tok.kind).collect() + } - #[test] - fn tokens() { - let output = token_kinds("let a: A = c ++ d"); - assert_eq!(output, vec![Keyword(Let), ident!("a"), Colon, ident!("A"), - LAngleBracket, ident!("B"), RAngleBracket, Equals, ident!("c"), op!("++"), ident!("d")]); - } + #[test] + fn tokens() { + let output = token_kinds("let a: A = c ++ d"); + assert_eq!( + output, + vec![ + Keyword(Let), + ident!("a"), + Colon, + ident!("A"), + LAngleBracket, + ident!("B"), + RAngleBracket, + Equals, + ident!("c"), + op!("++"), + ident!("d") + ] + ); + } - #[test] - fn underscores() { - let output = token_kinds("4_8"); - assert_eq!(output, vec![digit!("4"), Underscore, digit!("8")]); + #[test] + fn underscores() { + let output = token_kinds("4_8"); + assert_eq!(output, vec![digit!("4"), Underscore, digit!("8")]); - let output = token_kinds("aba_yo"); - assert_eq!(output, vec![ident!("aba_yo")]); - } + let output = token_kinds("aba_yo"); + assert_eq!(output, vec![ident!("aba_yo")]); + } - #[test] - fn comments() { - let output = token_kinds("1 + /* hella /* bro */ */ 2"); - assert_eq!(output, vec![digit!("1"), op!("+"), digit!("2")]); + #[test] + fn comments() { + let output = token_kinds("1 + /* hella /* bro */ */ 2"); + assert_eq!(output, vec![digit!("1"), op!("+"), digit!("2")]); - let output = token_kinds("1 + /* hella /* bro */ 2"); - assert_eq!(output, vec![digit!("1"), op!("+"), Error("Unclosed comment".to_string())]); + let output = token_kinds("1 + /* hella /* bro */ 2"); + assert_eq!(output, vec![digit!("1"), op!("+"), Error("Unclosed comment".to_string())]); - //TODO not sure if I want this behavior - let output = token_kinds("1 + /* hella */ bro */ 2"); - assert_eq!(output, vec![digit!("1"), op!("+"), Identifier(Rc::new("bro".to_string())), Operator(Rc::new("*".to_string())), Slash, DigitGroup(Rc::new("2".to_string()))]); - } + //TODO not sure if I want this behavior + let output = token_kinds("1 + /* hella */ bro */ 2"); + assert_eq!( + output, + vec![ + digit!("1"), + op!("+"), + Identifier(Rc::new("bro".to_string())), + Operator(Rc::new("*".to_string())), + Slash, + DigitGroup(Rc::new("2".to_string())) + ] + ); + } - #[test] - fn backtick_operators() { - let output = token_kinds("1 `plus` 2"); - assert_eq!(output, vec![digit!("1"), op!("plus"), digit!("2")]); - } + #[test] + fn backtick_operators() { + let output = token_kinds("1 `plus` 2"); + assert_eq!(output, vec![digit!("1"), op!("plus"), digit!("2")]); + } - #[test] - fn string_literals() { - let output = token_kinds(r#""some string""#); - assert_eq!(output, vec![StrLiteral { s: Rc::new("some string".to_string()), prefix: None }]); + #[test] + fn string_literals() { + let output = token_kinds(r#""some string""#); + assert_eq!(output, vec![StrLiteral { s: Rc::new("some string".to_string()), prefix: None }]); - let output = token_kinds(r#"b"some bytestring""#); - assert_eq!(output, vec![StrLiteral { s: Rc::new("some bytestring".to_string()), prefix: Some(Rc::new("b".to_string())) }]); + let output = token_kinds(r#"b"some bytestring""#); + assert_eq!( + output, + vec![StrLiteral { + s: Rc::new("some bytestring".to_string()), + prefix: Some(Rc::new("b".to_string())) + }] + ); - let output = token_kinds(r#""Do \n \" escapes work\t""#); - assert_eq!(output, vec![StrLiteral { s: Rc::new("Do \n \" escapes work\t".to_string()), prefix: None }]); - } + let output = token_kinds(r#""Do \n \" escapes work\t""#); + assert_eq!( + output, + vec![StrLiteral { s: Rc::new("Do \n \" escapes work\t".to_string()), prefix: None }] + ); + } }