2017-09-06 05:09:20 -07:00
extern crate itertools ;
2017-09-07 19:38:22 -07:00
use std ::collections ::HashMap ;
2017-08-29 05:08:09 -07:00
use std ::rc ::Rc ;
2017-09-06 05:09:20 -07:00
use std ::iter ::{ Enumerate , Peekable } ;
use self ::itertools ::Itertools ;
2017-09-09 01:25:11 -07:00
use std ::vec ::IntoIter ;
2017-09-06 05:09:20 -07:00
use std ::str ::Chars ;
2017-08-29 04:27:07 -07:00
2017-09-09 01:25:11 -07:00
#[ derive(Debug, PartialEq, Clone) ]
2017-09-04 12:17:20 -07:00
pub enum TokenType {
2017-09-06 05:09:20 -07:00
Newline , Semicolon ,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
LParen , RParen ,
LSquareBracket , RSquareBracket ,
LAngleBracket , RAngleBracket ,
LCurlyBrace , RCurlyBrace ,
2017-09-07 22:29:23 -07:00
Pipe ,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
Comma , Period , Colon , Underscore ,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
Operator ( Rc < String > ) ,
DigitGroup ( Rc < String > ) , HexNumberSigil , BinNumberSigil ,
2017-08-29 05:08:09 -07:00
StrLiteral ( Rc < String > ) ,
Identifier ( Rc < String > ) ,
2017-09-04 12:17:20 -07:00
Keyword ( Kw ) ,
2017-09-06 05:09:20 -07:00
Error ( String ) ,
2017-08-29 05:08:09 -07:00
}
2017-09-08 02:43:03 -07:00
#[ derive(Debug, Clone, Copy, PartialEq) ]
2017-09-04 12:17:20 -07:00
pub enum Kw {
2017-09-07 22:29:23 -07:00
If , Else ,
2017-09-04 12:17:20 -07:00
Func ,
2017-09-06 05:09:20 -07:00
For ,
2017-09-07 23:40:42 -07:00
Match ,
Var , Const , Let , In ,
Alias , Type , SelfType , SelfIdent ,
2017-09-07 22:29:23 -07:00
Trait , Impl ,
True , False
2017-09-04 12:17:20 -07:00
}
2017-09-07 19:38:22 -07:00
lazy_static! {
static ref KEYWORDS : HashMap < & 'static str , Kw > =
hashmap! {
" if " = > Kw ::If ,
" else " = > Kw ::Else ,
2017-09-07 22:29:23 -07:00
" fn " = > Kw ::Func ,
" for " = > Kw ::For ,
2017-09-07 23:40:42 -07:00
" match " = > Kw ::Match ,
2017-09-07 22:29:23 -07:00
" var " = > Kw ::Var ,
" const " = > Kw ::Const ,
" let " = > Kw ::Let ,
2017-09-07 23:40:42 -07:00
" in " = > Kw ::In ,
" alias " = > Kw ::Alias ,
2017-09-07 22:29:23 -07:00
" type " = > Kw ::Type ,
" Self " = > Kw ::SelfType ,
" self " = > Kw ::SelfIdent ,
" trait " = > Kw ::Trait ,
" impl " = > Kw ::Impl ,
" true " = > Kw ::True ,
" false " = > Kw ::False ,
2017-09-07 19:38:22 -07:00
} ;
}
2017-09-04 12:17:20 -07:00
#[ derive(Debug) ]
pub struct Token {
token_type : TokenType ,
2017-09-06 05:09:20 -07:00
offset : usize ,
2017-09-04 12:17:20 -07:00
}
2017-09-06 23:52:25 -07:00
impl Token {
pub fn get_error ( & self ) -> Option < & String > {
match self . token_type {
TokenType ::Error ( ref s ) = > Some ( s ) ,
_ = > None ,
}
}
}
2017-09-06 05:09:20 -07:00
fn is_digit ( c : & char ) -> bool {
c . is_digit ( 10 )
}
type CharIter < ' a > = Peekable < Enumerate < Chars < ' a > > > ;
2017-09-06 23:52:25 -07:00
pub fn tokenize ( input : & str ) -> Vec < Token > {
2017-09-06 05:09:20 -07:00
use self ::TokenType ::* ;
let mut tokens : Vec < Token > = Vec ::new ( ) ;
let mut input : CharIter = input . chars ( ) . enumerate ( ) . peekable ( ) ;
while let Some ( ( idx , c ) ) = input . next ( ) {
let cur_tok_type = match c {
'#' = > {
if let Some ( & ( _ , '{' ) ) = input . peek ( ) {
} else {
while let Some ( ( _ , c ) ) = input . next ( ) {
if c = = '\n' {
break ;
}
}
}
continue ;
} ,
2017-09-07 22:29:23 -07:00
c if c . is_whitespace ( ) & & c ! = '\n' = > continue ,
2017-09-06 05:09:20 -07:00
'\n' = > Newline , ';' = > Semicolon ,
2017-09-06 09:42:29 -07:00
':' = > Colon , ',' = > Comma , '.' = > Period ,
2017-09-06 05:09:20 -07:00
'(' = > LParen , ')' = > RParen ,
'{' = > LCurlyBrace , '}' = > RCurlyBrace ,
'<' = > LAngleBracket , '>' = > RAngleBracket ,
'[' = > LSquareBracket , ']' = > RSquareBracket ,
2017-09-07 22:29:23 -07:00
'|' = > Pipe ,
2017-09-06 09:42:29 -07:00
'"' = > handle_quote ( & mut input ) ,
2017-09-06 05:09:20 -07:00
c if is_digit ( & c ) = > handle_digit ( c , & mut input ) ,
2017-09-08 01:33:27 -07:00
c if c . is_alphabetic ( ) | | c = = '_' = > handle_alphabetic ( c , & mut input ) , //TODO I'll probably have to rewrite this if I care about types being uppercase, also type parameterization
2017-09-06 09:42:29 -07:00
c = > handle_operator ( c , & mut input ) ,
2017-09-06 05:09:20 -07:00
} ;
tokens . push ( Token { token_type : cur_tok_type , offset : idx } ) ;
}
2017-09-06 23:52:25 -07:00
tokens
2017-09-06 05:09:20 -07:00
}
fn handle_digit ( c : char , input : & mut CharIter ) -> TokenType {
use self ::TokenType ::* ;
if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , c ) | { c = = 'x' } ) {
input . next ( ) ;
HexNumberSigil
} else if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , c ) | { c = = 'b' } ) {
input . next ( ) ;
BinNumberSigil
} else {
let mut buf = c . to_string ( ) ;
buf . extend ( input . peeking_take_while ( | & ( _ , ref c ) | is_digit ( c ) ) . map ( | ( _ , c ) | { c } ) ) ;
DigitGroup ( Rc ::new ( buf ) )
}
2017-08-29 05:08:09 -07:00
}
2017-09-06 09:42:29 -07:00
fn handle_quote ( input : & mut CharIter ) -> TokenType {
2017-09-06 16:52:49 -07:00
let mut buf = String ::new ( ) ;
2017-09-07 00:18:36 -07:00
loop {
match input . next ( ) . map ( | ( _ , c ) | { c } ) {
Some ( '"' ) = > break ,
Some ( '\\' ) = > {
let next = input . peek ( ) . map ( | & ( _ , c ) | { c } ) ;
if next = = Some ( 'n' ) {
input . next ( ) ;
buf . push ( '\n' )
} else if next = = Some ( '"' ) {
input . next ( ) ;
buf . push ( '"' ) ;
} else if next = = Some ( 't' ) {
input . next ( ) ;
buf . push ( '\t' ) ;
}
} ,
Some ( c ) = > buf . push ( c ) ,
None = > return TokenType ::Error ( format! ( " Unclosed string " ) ) ,
2017-09-06 16:52:49 -07:00
}
}
TokenType ::StrLiteral ( Rc ::new ( buf ) )
2017-09-06 09:42:29 -07:00
}
fn handle_alphabetic ( c : char , input : & mut CharIter ) -> TokenType {
2017-09-07 19:38:22 -07:00
let mut buf = String ::new ( ) ;
buf . push ( c ) ;
2017-09-08 01:33:27 -07:00
if c = = '_' & & input . peek ( ) . map ( | & ( _ , c ) | { ! c . is_alphabetic ( ) } ) . unwrap_or ( true ) {
return TokenType ::Identifier ( Rc ::new ( format! ( " _ " ) ) )
}
2017-09-07 19:38:22 -07:00
loop {
match input . peek ( ) . map ( | & ( _ , c ) | { c } ) {
Some ( c ) if c . is_alphanumeric ( ) = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break ,
}
}
match KEYWORDS . get ( buf . as_str ( ) ) {
Some ( kw ) = > TokenType ::Keyword ( kw . clone ( ) ) ,
None = > TokenType ::Identifier ( Rc ::new ( buf ) ) ,
}
2017-09-06 09:42:29 -07:00
}
fn handle_operator ( c : char , input : & mut CharIter ) -> TokenType {
2017-09-07 22:29:23 -07:00
let mut buf = String ::new ( ) ;
buf . push ( c ) ;
loop {
match input . peek ( ) . map ( | & ( _ , c ) | { c } ) {
Some ( c ) if ! c . is_alphabetic ( ) & & ! c . is_whitespace ( ) = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break
}
}
TokenType ::Operator ( Rc ::new ( buf ) )
2017-09-06 09:42:29 -07:00
}
2017-09-08 02:43:03 -07:00
#[ cfg(test) ]
mod schala_tokenizer_tests {
use super ::* ;
use super ::TokenType ::* ;
use super ::Kw ::* ;
macro_rules ! ident { ( $ident :expr ) = > { Identifier ( Rc ::new ( $ident . to_string ( ) ) ) } }
macro_rules ! op { ( $ident :expr ) = > { Operator ( Rc ::new ( $ident . to_string ( ) ) ) } }
#[ test ]
fn tokens ( ) {
let a = tokenize ( " let a: A<B> = c ++ d " ) ;
let token_types : Vec < TokenType > = a . into_iter ( ) . map ( move | t | t . token_type ) . collect ( ) ;
assert_eq! ( token_types , vec! [ Keyword ( Let ) , ident! ( " a " ) , Colon , ident! ( " A " ) ,
LAngleBracket , ident! ( " B " ) , RAngleBracket , op! ( " = " ) , ident! ( " c " ) , op! ( " ++ " ) , ident! ( " d " ) ] ) ;
}
}
2017-09-06 09:42:29 -07:00
2017-08-29 05:08:09 -07:00
/*
2017-08-30 04:28:52 -07:00
Schala EBNF grammar
type alias < name > = < other type >
type < name > = struct { < field > : < type > , * }
type < name > = Variant1 | Variant2 ( type , type ) | Variant3 struct { }
' ' = literal , all other symbols are nonterminals
2017-08-29 05:08:09 -07:00
program := ( statement delimiter ? ) *
2017-08-30 04:28:52 -07:00
delimiter := ' Newline ' | ';'
2017-08-29 05:08:09 -07:00
statement := declaration | expression
2017-08-30 04:28:52 -07:00
declaration := module | function | type_decl
type_decl := ' type ' type_format
type_format := ' alias ' ' = ' type | type_constructor
type_constructor := capital_ident '=' type_rhs
type_rhs := struct_decl | type_variant ( '|' type_variant ) *
struct_decl := ' struct ' ' { ' ( ident ':' type ) * '}'
type_variant := capital_ident | tuple_type | capital_ident struct_decl
tuple_type := // something like Variant(a,b)
type := // something like Type[A[b]]
ascription := expression ( ':' type ) +
function := ' fn ' prototype '{' ( statement ) * '}'
prototype := identifier '(' identlist ')'
identlist := identifier ( ',' identifier ) * | ε
2017-08-29 05:08:09 -07:00
declaration := FN prototype LCurlyBrace ( statement ) * RCurlyBrace
prototype := identifier LParen identlist RParen
identlist := Ident ( Comma Ident ) * | ε
exprlist := Expression ( Comma Expression ) * | ε
itemlist := Ident COLON Expression ( Comma Ident COLON Expression ) * | ε
expression := postop_expression ( op postop_expression ) *
postop_expression := primary_expression postop
primary_expression := number_expr | String | identifier_expr | paren_expr | conditional_expr | while_expr | lambda_expr | list_expr | struct_expr
number_expr := ( PLUS | MINUS ) number_expr | Number
identifier_expr := call_expression | Variable
list_expr := LSquareBracket exprlist RSquareBracket
struct_expr := LCurlyBrace itemlist RCurlyBrace
call_expression := Identifier LParen exprlist RParen
while_expr := WHILE primary_expression LCurlyBrace ( expression delimiter ) * RCurlyBrace
paren_expr := LParen expression RParen
conditional_expr := IF expression LCurlyBrace ( expression delimiter ) * RCurlyBrace ( LCurlyBrace ( expresion delimiter ) * RCurlyBrace ) ?
lambda_expr := FN LParen identlist RParen LCurlyBrace ( expression delimiter ) * RCurlyBrace
lambda_call := | LParen exprlist RParen
postop := ε | LParen exprlist RParen | LBracket expression RBracket
op := '+' , '-' , etc .
* /
2017-09-09 01:25:11 -07:00
type TokenIter = Peekable < IntoIter < Token > > ;
2017-09-08 16:42:42 -07:00
struct Parser {
2017-09-09 01:25:11 -07:00
tokens : TokenIter ,
}
impl Parser {
fn new ( input : Vec < Token > ) -> Parser {
Parser { tokens : input . into_iter ( ) . peekable ( ) }
}
2017-09-09 01:27:15 -07:00
fn peek ( & mut self ) -> Option < TokenType > {
self . tokens . peek ( ) . map ( | ref t | { t . token_type . clone ( ) } )
}
fn next ( & mut self ) -> Option < TokenType > {
self . tokens . next ( ) . map ( | ref t | { t . token_type . clone ( ) } )
2017-09-09 01:25:11 -07:00
}
fn program ( & mut self ) -> ParseResult < AST > {
use self ::Statement ::* ; use self ::Declaration ::* ; use self ::Expression ::* ;
let statements = vec! [ Expression ( UnsignedIntLiteral ( 1 ) ) ] ;
Ok ( AST ( statements ) )
}
2017-09-08 16:42:42 -07:00
}
2017-09-09 00:31:15 -07:00
pub struct ParseError {
pub msg : String ,
}
pub type ParseResult < T > = Result < T , ParseError > ;
2017-08-29 04:27:07 -07:00
#[ derive(Debug) ]
2017-09-08 16:42:42 -07:00
pub struct AST ( Vec < Statement > ) ;
#[ derive(Debug, PartialEq) ]
pub enum Statement {
Expression ( Expression ) ,
Declaration ( Declaration ) ,
}
#[ derive(Debug, PartialEq) ]
pub enum Declaration {
FuncDecl ,
TypeDecl
}
#[ derive(Debug, PartialEq) ]
pub enum Expression {
UnsignedIntLiteral ( u64 ) ,
SignedIntLiteral ( i64 ) ,
FloatLiteral ( f64 ) ,
}
2017-09-09 00:31:15 -07:00
pub fn parse ( input : Vec < Token > ) -> Result < AST , ParseError > {
let mut parser = Parser ::new ( input ) ;
parser . program ( )
2017-08-29 05:08:09 -07:00
}