From d423e888452c29071cd5e904b70b70bb05ae7500 Mon Sep 17 00:00:00 2001 From: greg Date: Fri, 23 Feb 2018 01:58:06 -0800 Subject: [PATCH] Separate tokenizing module Parsing was getting too long --- src/schala_lang/mod.rs | 3 +- src/schala_lang/parsing.rs | 245 +--------------------------------- src/schala_lang/tokenizing.rs | 242 +++++++++++++++++++++++++++++++++ 3 files changed, 248 insertions(+), 242 deletions(-) create mode 100644 src/schala_lang/tokenizing.rs diff --git a/src/schala_lang/mod.rs b/src/schala_lang/mod.rs index 96d81c7..5d4a147 100644 --- a/src/schala_lang/mod.rs +++ b/src/schala_lang/mod.rs @@ -1,6 +1,7 @@ use itertools::Itertools; use schala_lib::{ProgrammingLanguageInterface, EvalOptions, TraceArtifact, ReplOutput}; +mod tokenizing; mod parsing; //mod type_check; mod typechecking; @@ -33,7 +34,7 @@ impl ProgrammingLanguageInterface for Schala { fn evaluate_in_repl(&mut self, input: &str, options: &EvalOptions) -> ReplOutput { let mut output = ReplOutput::default(); - let tokens = parsing::tokenize(input); + let tokens = tokenizing::tokenize(input); if options.debug_tokens { let token_string = tokens.iter().map(|t| format!("{:?}<{}>", t.token_type, t.offset)).join(", "); output.add_artifact(TraceArtifact::new("tokens", format!("{:?}", token_string))); diff --git a/src/schala_lang/parsing.rs b/src/schala_lang/parsing.rs index 75d14d1..bb6e3c9 100644 --- a/src/schala_lang/parsing.rs +++ b/src/schala_lang/parsing.rs @@ -1,247 +1,10 @@ -use itertools::Itertools; -use std::collections::HashMap; use std::rc::Rc; -use std::iter::{Enumerate, Peekable}; +use std::iter::Peekable; use std::vec::IntoIter; -use std::str::Chars; -#[derive(Debug, PartialEq, Clone)] -pub enum TokenType { - Newline, Semicolon, - - LParen, RParen, - LSquareBracket, RSquareBracket, - LAngleBracket, RAngleBracket, - LCurlyBrace, RCurlyBrace, - Pipe, - - Comma, Period, Colon, Underscore, - - Operator(Rc), - DigitGroup(Rc), HexLiteral(Rc), BinNumberSigil, - StrLiteral(Rc), - Identifier(Rc), - Keyword(Kw), - - EOF, - - Error(String), -} -use self::TokenType::*; - -#[derive(Debug, Clone, Copy, PartialEq)] -pub enum Kw { - If, Else, - Func, - For, - Match, - Var, Const, Let, In, - Return, - Alias, Type, SelfType, SelfIdent, - Trait, Impl, - True, False, - Module -} -use self::Kw::*; - -lazy_static! { - static ref KEYWORDS: HashMap<&'static str, Kw> = - hashmap! { - "if" => Kw::If, - "else" => Kw::Else, - "fn" => Kw::Func, - "for" => Kw::For, - "match" => Kw::Match, - "var" => Kw::Var, - "const" => Kw::Const, - "let" => Kw::Let, - "in" => Kw::In, - "return" => Kw::Return, - "alias" => Kw::Alias, - "type" => Kw::Type, - "Self" => Kw::SelfType, - "self" => Kw::SelfIdent, - "trait" => Kw::Trait, - "impl" => Kw::Impl, - "true" => Kw::True, - "false" => Kw::False, - "module" => Kw::Module, - }; -} - -#[derive(Debug)] -pub struct Token { - pub token_type: TokenType, - pub offset: usize, -} - -impl Token { - pub fn get_error(&self) -> Option<&String> { - match self.token_type { - TokenType::Error(ref s) => Some(s), - _ => None, - } - } -} - -const OPERATOR_CHARS: [char; 19] = ['!', '$', '%', '&', '*', '+', '-', '.', '/', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`']; -fn is_operator(c: &char) -> bool { - OPERATOR_CHARS.iter().any(|x| x == c) -} - -type CharIter<'a> = Peekable>>; - -pub fn tokenize(input: &str) -> Vec { - let mut tokens: Vec = Vec::new(); - let mut input: CharIter = input.chars().enumerate().peekable(); - - while let Some((idx, c)) = input.next() { - let cur_tok_type = match c { - '#' => { - if let Some(&(_, '{')) = input.peek() { - } else { - while let Some((_, c)) = input.next() { - if c == '\n' { - break; - } - } - } - continue; - }, - c if c.is_whitespace() && c != '\n' => continue, - '\n' => Newline, ';' => Semicolon, - ':' => Colon, ',' => Comma, - '(' => LParen, ')' => RParen, - '{' => LCurlyBrace, '}' => RCurlyBrace, - '[' => LSquareBracket, ']' => RSquareBracket, - '"' => handle_quote(&mut input), - c if c.is_digit(10) => handle_digit(c, &mut input), - c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input), //TODO I'll probably have to rewrite this if I care about types being uppercase, also type parameterization - c if is_operator(&c) => handle_operator(c, &mut input), - unknown => Error(format!("Unexpected character: {}", unknown)), - }; - tokens.push(Token { token_type: cur_tok_type, offset: idx }); - } - tokens -} - -fn handle_digit(c: char, input: &mut CharIter) -> TokenType { - if c == '0' && input.peek().map_or(false, |&(_, c)| { c == 'x' }) { - input.next(); - let rest: String = input.peeking_take_while(|&(_, ref c)| c.is_digit(16) || *c == '_').map(|(_, c)| { c }).collect(); - HexLiteral(Rc::new(rest)) - } else if c == '0' && input.peek().map_or(false, |&(_, c)| { c == 'b' }) { - input.next(); - BinNumberSigil - } else { - let mut buf = c.to_string(); - buf.extend(input.peeking_take_while(|&(_, ref c)| c.is_digit(10)).map(|(_, c)| { c })); - DigitGroup(Rc::new(buf)) - } -} - -fn handle_quote(input: &mut CharIter) -> TokenType { - let mut buf = String::new(); - loop { - match input.next().map(|(_, c)| { c }) { - Some('"') => break, - Some('\\') => { - let next = input.peek().map(|&(_, c)| { c }); - if next == Some('n') { - input.next(); - buf.push('\n') - } else if next == Some('"') { - input.next(); - buf.push('"'); - } else if next == Some('t') { - input.next(); - buf.push('\t'); - } - }, - Some(c) => buf.push(c), - None => return TokenType::Error(format!("Unclosed string")), - } - } - TokenType::StrLiteral(Rc::new(buf)) -} - -fn handle_alphabetic(c: char, input: &mut CharIter) -> TokenType { - let mut buf = String::new(); - buf.push(c); - if c == '_' && input.peek().map(|&(_, c)| { !c.is_alphabetic() }).unwrap_or(true) { - return TokenType::Underscore - } - - loop { - match input.peek().map(|&(_, c)| { c }) { - Some(c) if c.is_alphanumeric() => { - input.next(); - buf.push(c); - }, - _ => break, - } - } - - match KEYWORDS.get(buf.as_str()) { - Some(kw) => TokenType::Keyword(*kw), - None => TokenType::Identifier(Rc::new(buf)), - } -} - -fn handle_operator(c: char, input: &mut CharIter) -> TokenType { - match c { - '<' | '>' | '|' | '.' => { - let ref next = input.peek().map(|&(_, c)| { c }); - if !next.map(|n| { is_operator(&n) }).unwrap_or(false) { - return match c { - '<' => LAngleBracket, - '>' => RAngleBracket, - '|' => Pipe, - '.' => Period, - _ => unreachable!(), - } - } - }, - _ => (), - }; - - let mut buf = String::new(); - buf.push(c); - loop { - match input.peek().map(|&(_, c)| { c }) { - Some(c) if is_operator(&c) => { - input.next(); - buf.push(c); - }, - _ => break - } - } - TokenType::Operator(Rc::new(buf)) -} - -#[cfg(test)] -mod schala_tokenizer_tests { - use super::*; - - macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } } - macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } } - macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } } - - - #[test] - fn tokens() { - let a = tokenize("let a: A = c ++ d"); - let token_types: Vec = a.into_iter().map(move |t| t.token_type).collect(); - assert_eq!(token_types, vec![Keyword(Let), ident!("a"), Colon, ident!("A"), - LAngleBracket, ident!("B"), RAngleBracket, op!("="), ident!("c"), op!("++"), ident!("d")]); - } - - #[test] - fn underscores() { - let token_types: Vec = tokenize("4_8").into_iter().map(move |t| t.token_type).collect(); - assert_eq!(token_types, vec![digit!("4"), Underscore, digit!("8")]); - } -} +use schala_lang::tokenizing::*; +use schala_lang::tokenizing::Kw::*; +use schala_lang::tokenizing::TokenType::*; /* Schala EBNF Grammar */ /* Terminal productions are in 'single quotes' or UPPERCASE if they are a class diff --git a/src/schala_lang/tokenizing.rs b/src/schala_lang/tokenizing.rs new file mode 100644 index 0000000..5971955 --- /dev/null +++ b/src/schala_lang/tokenizing.rs @@ -0,0 +1,242 @@ +use itertools::Itertools; +use std::collections::HashMap; +use std::rc::Rc; +use std::iter::{Enumerate, Peekable}; +use std::str::Chars; + +#[derive(Debug, PartialEq, Clone)] +pub enum TokenType { + Newline, Semicolon, + + LParen, RParen, + LSquareBracket, RSquareBracket, + LAngleBracket, RAngleBracket, + LCurlyBrace, RCurlyBrace, + Pipe, + + Comma, Period, Colon, Underscore, + + Operator(Rc), + DigitGroup(Rc), HexLiteral(Rc), BinNumberSigil, + StrLiteral(Rc), + Identifier(Rc), + Keyword(Kw), + + EOF, + + Error(String), +} +use self::TokenType::*; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Kw { + If, Else, + Func, + For, + Match, + Var, Const, Let, In, + Return, + Alias, Type, SelfType, SelfIdent, + Trait, Impl, + True, False, + Module +} + +lazy_static! { + static ref KEYWORDS: HashMap<&'static str, Kw> = + hashmap! { + "if" => Kw::If, + "else" => Kw::Else, + "fn" => Kw::Func, + "for" => Kw::For, + "match" => Kw::Match, + "var" => Kw::Var, + "const" => Kw::Const, + "let" => Kw::Let, + "in" => Kw::In, + "return" => Kw::Return, + "alias" => Kw::Alias, + "type" => Kw::Type, + "Self" => Kw::SelfType, + "self" => Kw::SelfIdent, + "trait" => Kw::Trait, + "impl" => Kw::Impl, + "true" => Kw::True, + "false" => Kw::False, + "module" => Kw::Module, + }; +} + +#[derive(Debug)] +pub struct Token { + pub token_type: TokenType, + pub offset: usize, +} + +impl Token { + pub fn get_error(&self) -> Option<&String> { + match self.token_type { + TokenType::Error(ref s) => Some(s), + _ => None, + } + } +} + +const OPERATOR_CHARS: [char; 19] = ['!', '$', '%', '&', '*', '+', '-', '.', '/', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`']; +fn is_operator(c: &char) -> bool { + OPERATOR_CHARS.iter().any(|x| x == c) +} + +type CharIter<'a> = Peekable>>; + +pub fn tokenize(input: &str) -> Vec { + let mut tokens: Vec = Vec::new(); + let mut input: CharIter = input.chars().enumerate().peekable(); + + while let Some((idx, c)) = input.next() { + let cur_tok_type = match c { + '#' => { + if let Some(&(_, '{')) = input.peek() { + } else { + while let Some((_, c)) = input.next() { + if c == '\n' { + break; + } + } + } + continue; + }, + c if c.is_whitespace() && c != '\n' => continue, + '\n' => Newline, ';' => Semicolon, + ':' => Colon, ',' => Comma, + '(' => LParen, ')' => RParen, + '{' => LCurlyBrace, '}' => RCurlyBrace, + '[' => LSquareBracket, ']' => RSquareBracket, + '"' => handle_quote(&mut input), + c if c.is_digit(10) => handle_digit(c, &mut input), + c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input), //TODO I'll probably have to rewrite this if I care about types being uppercase, also type parameterization + c if is_operator(&c) => handle_operator(c, &mut input), + unknown => Error(format!("Unexpected character: {}", unknown)), + }; + tokens.push(Token { token_type: cur_tok_type, offset: idx }); + } + tokens +} + +fn handle_digit(c: char, input: &mut CharIter) -> TokenType { + if c == '0' && input.peek().map_or(false, |&(_, c)| { c == 'x' }) { + input.next(); + let rest: String = input.peeking_take_while(|&(_, ref c)| c.is_digit(16) || *c == '_').map(|(_, c)| { c }).collect(); + HexLiteral(Rc::new(rest)) + } else if c == '0' && input.peek().map_or(false, |&(_, c)| { c == 'b' }) { + input.next(); + BinNumberSigil + } else { + let mut buf = c.to_string(); + buf.extend(input.peeking_take_while(|&(_, ref c)| c.is_digit(10)).map(|(_, c)| { c })); + DigitGroup(Rc::new(buf)) + } +} + +fn handle_quote(input: &mut CharIter) -> TokenType { + let mut buf = String::new(); + loop { + match input.next().map(|(_, c)| { c }) { + Some('"') => break, + Some('\\') => { + let next = input.peek().map(|&(_, c)| { c }); + if next == Some('n') { + input.next(); + buf.push('\n') + } else if next == Some('"') { + input.next(); + buf.push('"'); + } else if next == Some('t') { + input.next(); + buf.push('\t'); + } + }, + Some(c) => buf.push(c), + None => return TokenType::Error(format!("Unclosed string")), + } + } + TokenType::StrLiteral(Rc::new(buf)) +} + +fn handle_alphabetic(c: char, input: &mut CharIter) -> TokenType { + let mut buf = String::new(); + buf.push(c); + if c == '_' && input.peek().map(|&(_, c)| { !c.is_alphabetic() }).unwrap_or(true) { + return TokenType::Underscore + } + + loop { + match input.peek().map(|&(_, c)| { c }) { + Some(c) if c.is_alphanumeric() => { + input.next(); + buf.push(c); + }, + _ => break, + } + } + + match KEYWORDS.get(buf.as_str()) { + Some(kw) => TokenType::Keyword(*kw), + None => TokenType::Identifier(Rc::new(buf)), + } +} + +fn handle_operator(c: char, input: &mut CharIter) -> TokenType { + match c { + '<' | '>' | '|' | '.' => { + let ref next = input.peek().map(|&(_, c)| { c }); + if !next.map(|n| { is_operator(&n) }).unwrap_or(false) { + return match c { + '<' => LAngleBracket, + '>' => RAngleBracket, + '|' => Pipe, + '.' => Period, + _ => unreachable!(), + } + } + }, + _ => (), + }; + + let mut buf = String::new(); + buf.push(c); + loop { + match input.peek().map(|&(_, c)| { c }) { + Some(c) if is_operator(&c) => { + input.next(); + buf.push(c); + }, + _ => break + } + } + TokenType::Operator(Rc::new(buf)) +} + +#[cfg(test)] +mod schala_tokenizer_tests { + use super::*; + + macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } } + macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } } + macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } } + + + #[test] + fn tokens() { + let a = tokenize("let a: A = c ++ d"); + let token_types: Vec = a.into_iter().map(move |t| t.token_type).collect(); + assert_eq!(token_types, vec![Keyword(Let), ident!("a"), Colon, ident!("A"), + LAngleBracket, ident!("B"), RAngleBracket, op!("="), ident!("c"), op!("++"), ident!("d")]); + } + + #[test] + fn underscores() { + let token_types: Vec = tokenize("4_8").into_iter().map(move |t| t.token_type).collect(); + assert_eq!(token_types, vec![digit!("4"), Underscore, digit!("8")]); + } +}