schala/src/maaru_lang/tokenizer.rs

extern crate itertools;
use std::iter::Peekable;
use std::str::Chars;
use self::itertools::Itertools;
use std::rc::Rc;
use language::TokenError;
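// A single lexical token of Maaru source text, as produced by `tokenize` below.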
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Newline,
    Semicolon,
    LParen,
    RParen,
    LSquareBracket,
    RSquareBracket,
    LCurlyBrace,
    RCurlyBrace,
    Comma,
    Period,
    Colon,
    NumLiteral(f64),
    StrLiteral(Rc<String>),
    Identifier(Rc<String>),
    Operator(OpTok),
    Keyword(Kw),
}

#[derive(Debug, Clone, PartialEq)]
pub struct OpTok(pub Rc<String>);

#[derive(Debug, Clone, PartialEq)]
pub enum Kw {
    If,
    Else,
    While,
    Let,
    Fn,
    Null,
}

pub type TokenizeResult = Result<Vec<Token>, TokenError>;

fn is_digit(c: &char) -> bool {
    c.is_digit(10)
}
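// Top-level entry point: converts an input string into a flat Vec<Token>,
// or a TokenError if a literal cannot be read.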
pub fn tokenize(input: &str) -> TokenizeResult {
    use self::Token::*;
    let mut tokens = Vec::new();
    let mut iter: Peekable<Chars> = input.chars().peekable();
    while let Some(c) = iter.next() {
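        // A '#' begins a line comment; skip everything up to and including the newline.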
        if c == '#' {
            while let Some(c) = iter.next() {
                if c == '\n' {
                    break;
                }
            }
            continue;
        }
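        // Single-character tokens are matched inline; string, operator, number, and
        // identifier tokens are delegated to the helpers below.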
        let cur_tok = match c {
            c if char::is_whitespace(c) && c != '\n' => continue,
            '\n' => Newline,
            ';' => Semicolon,
            '(' => LParen,
            ')' => RParen,
            ':' => Colon,
            ',' => Comma,
            '{' => LCurlyBrace,
            '}' => RCurlyBrace,
            '[' => LSquareBracket,
            ']' => RSquareBracket,
            '"' => tokenize_str(&mut iter)?,
            // This arm must come before the general operator arm, so that '.' and
            // digits reach tokenize_number_or_period rather than tokenize_operator.
            c if c == '.' || is_digit(&c) => tokenize_number_or_period(c, &mut iter)?,
            c if !char::is_alphanumeric(c) => tokenize_operator(c, &mut iter)?,
            c => tokenize_identifier(c, &mut iter)?,
        };
        tokens.push(cur_tok);
    }
    Ok(tokens)
}
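// Reads a string literal body, consuming characters until the closing '"';
// an unterminated string is a TokenError.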
fn tokenize_str(iter: &mut Peekable<Chars>) -> Result<Token, TokenError> {
    let mut buffer = String::new();
    loop {
        // TODO handle string escapes, interpolation
        match iter.next() {
            Some(x) if x == '"' => break,
            Some(x) => buffer.push(x),
            None => return Err(TokenError::new("Unclosed quote")),
        }
    }
    Ok(Token::StrLiteral(Rc::new(buffer)))
}
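// Greedily collects a maximal run of non-alphanumeric, non-whitespace characters
// into a single operator token.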
fn tokenize_operator(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenError> {
    let mut buffer = String::new();
    buffer.push(c);
    buffer.extend(iter.peeking_take_while(|x| !char::is_alphanumeric(*x) && !char::is_whitespace(*x)));
    Ok(Token::Operator(OpTok(Rc::new(buffer))))
}
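// A '.' not followed by a digit is a Period token; otherwise digits and dots are
// collected and parsed as an f64 literal.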
fn tokenize_number_or_period(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenError> {
    if c == '.' && !iter.peek().map_or(false, is_digit) {
        return Ok(Token::Period);
    }
    let mut buffer = String::new();
    buffer.push(c);
    buffer.extend(iter.peeking_take_while(|x| is_digit(x) || *x == '.'));
    match buffer.parse::<f64>() {
        Ok(f) => Ok(Token::NumLiteral(f)),
        Err(_) => Err(TokenError::new("Failed to parse number literal")),
    }
}
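// Collects characters until ends_identifier fires, then maps known keywords to
// Keyword tokens and everything else to Identifier.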
fn tokenize_identifier(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenError> {
    fn ends_identifier(c: &char) -> bool {
        let c = *c;
        char::is_whitespace(c) || is_digit(&c) || c == ';' || c == '(' || c == ')' ||
            c == ',' || c == '.' || c == ':' || c == '[' || c == ']'
    }
    use self::Token::*;
    let mut buffer = String::new();
    buffer.push(c);
    buffer.extend(iter.peeking_take_while(|x| !ends_identifier(x)));
    Ok(match &buffer[..] {
        "if" => Keyword(Kw::If),
        "else" => Keyword(Kw::Else),
        "while" => Keyword(Kw::While),
        "let" => Keyword(Kw::Let),
        "fn" => Keyword(Kw::Fn),
        "null" => Keyword(Kw::Null),
        b => Identifier(Rc::new(b.to_string())),
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use super::Token::*;

    macro_rules! token_test {
        ($input: expr, $output: pat, $ifexpr: expr) => {
            let tokens = tokenize($input).unwrap();
            match tokens[..] {
                $output if $ifexpr => (),
                _ => panic!("Actual output: {:?}", tokens),
            }
        }
    }

    #[test]
    fn basic_tokenization_tests() {
        token_test!("let a = 3\n",
                    [Keyword(Kw::Let), Identifier(ref a), Operator(OpTok(ref b)), NumLiteral(3.0), Newline],
                    **a == "a" && **b == "=");

        token_test!("2+1",
                    [NumLiteral(2.0), Operator(OpTok(ref a)), NumLiteral(1.0)],
                    **a == "+");

        token_test!("2 + 1",
                    [NumLiteral(2.0), Operator(OpTok(ref a)), NumLiteral(1.0)],
                    **a == "+");

        token_test!("2.3*49.2",
                    [NumLiteral(2.3), Operator(OpTok(ref a)), NumLiteral(49.2)],
                    **a == "*");

        token_test!("a+3",
                    [Identifier(ref a), NumLiteral(3.0)],
                    **a == "a+");

        assert!(tokenize("2.4.5").is_err());

        token_test!("fn my_func(a) { a ? 3[1] }",
                    [Keyword(Kw::Fn), Identifier(ref a), LParen, Identifier(ref b), RParen, LCurlyBrace, Identifier(ref c),
                     Operator(OpTok(ref d)), NumLiteral(3.0), LSquareBracket, NumLiteral(1.0), RSquareBracket, RCurlyBrace],
                    **a == "my_func" && **b == "a" && **c == "a" && **d == "?");
    }

    #[test]
    fn string_test() {
        token_test!("null + \"a string\"",
                    [Keyword(Kw::Null), Operator(OpTok(ref a)), StrLiteral(ref b)],
                    **a == "+" && **b == "a string");

        token_test!("\"{?'q@?\"",
                    [StrLiteral(ref a)],
                    **a == "{?'q@?");
    }

    #[test]
    fn operator_test() {
        token_test!("a *> b",
                    [Identifier(ref a), Operator(OpTok(ref b)), Identifier(ref c)],
                    **a == "a" && **b == "*>" && **c == "b");
    }
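    // Sketch of an extra check, not in the original suite: '#' line comments are
    // skipped together with the newline that ends them, so no Newline token appears.
    #[test]
    fn comment_test() {
        token_test!("# a comment\n2 + 2",
                    [NumLiteral(2.0), Operator(OpTok(ref a)), NumLiteral(2.0)],
                    **a == "+");
    }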
}