2017-09-06 05:09:20 -07:00
extern crate itertools ;
2017-09-07 19:38:22 -07:00
use std ::collections ::HashMap ;
2017-08-29 05:08:09 -07:00
use std ::rc ::Rc ;
2017-09-06 05:09:20 -07:00
use std ::iter ::{ Enumerate , Peekable } ;
use self ::itertools ::Itertools ;
2017-09-09 01:25:11 -07:00
use std ::vec ::IntoIter ;
2017-09-06 05:09:20 -07:00
use std ::str ::Chars ;
2017-08-29 04:27:07 -07:00
2017-09-09 01:25:11 -07:00
#[ derive(Debug, PartialEq, Clone) ]
2017-09-04 12:17:20 -07:00
pub enum TokenType {
2017-09-06 05:09:20 -07:00
Newline , Semicolon ,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
LParen , RParen ,
LSquareBracket , RSquareBracket ,
LAngleBracket , RAngleBracket ,
LCurlyBrace , RCurlyBrace ,
2017-09-07 22:29:23 -07:00
Pipe ,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
Comma , Period , Colon , Underscore ,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
Operator ( Rc < String > ) ,
DigitGroup ( Rc < String > ) , HexNumberSigil , BinNumberSigil ,
2017-08-29 05:08:09 -07:00
StrLiteral ( Rc < String > ) ,
Identifier ( Rc < String > ) ,
2017-09-04 12:17:20 -07:00
Keyword ( Kw ) ,
2017-09-06 05:09:20 -07:00
2017-09-11 02:07:17 -07:00
EOF ,
2017-09-06 05:09:20 -07:00
Error ( String ) ,
2017-08-29 05:08:09 -07:00
}
2017-09-11 03:13:19 -07:00
use self ::TokenType ::* ;
2017-08-29 05:08:09 -07:00
2017-09-08 02:43:03 -07:00
/// The reserved words of the language.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kw {
    /// `if`
    If,
    /// `else`
    Else,
    /// `fn`
    Func,
    /// `for`
    For,
    /// `match`
    Match,
    /// `var`
    Var,
    /// `const`
    Const,
    /// `let`
    Let,
    /// `in`
    In,
    /// `alias`
    Alias,
    /// `type`
    Type,
    /// `Self` (capitalised)
    SelfType,
    /// `self` (lower case)
    SelfIdent,
    /// `trait`
    Trait,
    /// `impl`
    Impl,
    /// `true`
    True,
    /// `false`
    False,
}
2017-09-11 03:13:19 -07:00
use self ::Kw ::* ;
2017-09-04 12:17:20 -07:00
2017-09-07 19:38:22 -07:00
// Lookup table from the exact source spelling of each reserved word to its
// `Kw` variant. The keys must not carry any surrounding whitespace: they are
// compared against identifier text, which never contains whitespace, so a
// padded key could never match.
lazy_static! {
    static ref KEYWORDS: HashMap<&'static str, Kw> =
        hashmap! {
            "if" => Kw::If,
            "else" => Kw::Else,
            "fn" => Kw::Func,
            "for" => Kw::For,
            "match" => Kw::Match,
            "var" => Kw::Var,
            "const" => Kw::Const,
            "let" => Kw::Let,
            "in" => Kw::In,
            "alias" => Kw::Alias,
            "type" => Kw::Type,
            "Self" => Kw::SelfType,
            "self" => Kw::SelfIdent,
            "trait" => Kw::Trait,
            "impl" => Kw::Impl,
            "true" => Kw::True,
            "false" => Kw::False,
        };
}
2017-09-04 12:17:20 -07:00
#[ derive(Debug) ]
pub struct Token {
token_type : TokenType ,
2017-09-06 05:09:20 -07:00
offset : usize ,
2017-09-04 12:17:20 -07:00
}
2017-09-06 23:52:25 -07:00
impl Token {
pub fn get_error ( & self ) -> Option < & String > {
match self . token_type {
TokenType ::Error ( ref s ) = > Some ( s ) ,
_ = > None ,
}
}
}
2017-09-06 05:09:20 -07:00
/// Returns `true` when `c` is a decimal digit (`0` through `9`).
fn is_digit(c: &char) -> bool {
    c.to_digit(10).is_some()
}
/// The tokenizer's input cursor: characters paired with their index, with
/// one item of lookahead available through `peek`.
type CharIter<'a> = Peekable<Enumerate<Chars<'a>>>;
2017-09-06 23:52:25 -07:00
pub fn tokenize ( input : & str ) -> Vec < Token > {
2017-09-06 05:09:20 -07:00
let mut tokens : Vec < Token > = Vec ::new ( ) ;
let mut input : CharIter = input . chars ( ) . enumerate ( ) . peekable ( ) ;
while let Some ( ( idx , c ) ) = input . next ( ) {
let cur_tok_type = match c {
'#' = > {
if let Some ( & ( _ , '{' ) ) = input . peek ( ) {
} else {
while let Some ( ( _ , c ) ) = input . next ( ) {
if c = = '\n' {
break ;
}
}
}
continue ;
} ,
2017-09-07 22:29:23 -07:00
c if c . is_whitespace ( ) & & c ! = '\n' = > continue ,
2017-09-06 05:09:20 -07:00
'\n' = > Newline , ';' = > Semicolon ,
2017-09-06 09:42:29 -07:00
':' = > Colon , ',' = > Comma , '.' = > Period ,
2017-09-06 05:09:20 -07:00
'(' = > LParen , ')' = > RParen ,
'{' = > LCurlyBrace , '}' = > RCurlyBrace ,
'<' = > LAngleBracket , '>' = > RAngleBracket ,
'[' = > LSquareBracket , ']' = > RSquareBracket ,
2017-09-07 22:29:23 -07:00
'|' = > Pipe ,
2017-09-06 09:42:29 -07:00
'"' = > handle_quote ( & mut input ) ,
2017-09-06 05:09:20 -07:00
c if is_digit ( & c ) = > handle_digit ( c , & mut input ) ,
2017-09-08 01:33:27 -07:00
c if c . is_alphabetic ( ) | | c = = '_' = > handle_alphabetic ( c , & mut input ) , //TODO I'll probably have to rewrite this if I care about types being uppercase, also type parameterization
2017-09-06 09:42:29 -07:00
c = > handle_operator ( c , & mut input ) ,
2017-09-06 05:09:20 -07:00
} ;
tokens . push ( Token { token_type : cur_tok_type , offset : idx } ) ;
}
2017-09-06 23:52:25 -07:00
tokens
2017-09-06 05:09:20 -07:00
}
fn handle_digit ( c : char , input : & mut CharIter ) -> TokenType {
if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , c ) | { c = = 'x' } ) {
input . next ( ) ;
HexNumberSigil
} else if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , c ) | { c = = 'b' } ) {
input . next ( ) ;
BinNumberSigil
} else {
let mut buf = c . to_string ( ) ;
buf . extend ( input . peeking_take_while ( | & ( _ , ref c ) | is_digit ( c ) ) . map ( | ( _ , c ) | { c } ) ) ;
DigitGroup ( Rc ::new ( buf ) )
}
2017-08-29 05:08:09 -07:00
}
2017-09-06 09:42:29 -07:00
fn handle_quote ( input : & mut CharIter ) -> TokenType {
2017-09-06 16:52:49 -07:00
let mut buf = String ::new ( ) ;
2017-09-07 00:18:36 -07:00
loop {
match input . next ( ) . map ( | ( _ , c ) | { c } ) {
Some ( '"' ) = > break ,
Some ( '\\' ) = > {
let next = input . peek ( ) . map ( | & ( _ , c ) | { c } ) ;
if next = = Some ( 'n' ) {
input . next ( ) ;
buf . push ( '\n' )
} else if next = = Some ( '"' ) {
input . next ( ) ;
buf . push ( '"' ) ;
} else if next = = Some ( 't' ) {
input . next ( ) ;
buf . push ( '\t' ) ;
}
} ,
Some ( c ) = > buf . push ( c ) ,
None = > return TokenType ::Error ( format! ( " Unclosed string " ) ) ,
2017-09-06 16:52:49 -07:00
}
}
TokenType ::StrLiteral ( Rc ::new ( buf ) )
2017-09-06 09:42:29 -07:00
}
fn handle_alphabetic ( c : char , input : & mut CharIter ) -> TokenType {
2017-09-07 19:38:22 -07:00
let mut buf = String ::new ( ) ;
buf . push ( c ) ;
2017-09-08 01:33:27 -07:00
if c = = '_' & & input . peek ( ) . map ( | & ( _ , c ) | { ! c . is_alphabetic ( ) } ) . unwrap_or ( true ) {
2017-09-11 02:07:17 -07:00
return TokenType ::Underscore
2017-09-08 01:33:27 -07:00
}
2017-09-07 19:38:22 -07:00
loop {
match input . peek ( ) . map ( | & ( _ , c ) | { c } ) {
Some ( c ) if c . is_alphanumeric ( ) = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break ,
}
}
match KEYWORDS . get ( buf . as_str ( ) ) {
Some ( kw ) = > TokenType ::Keyword ( kw . clone ( ) ) ,
None = > TokenType ::Identifier ( Rc ::new ( buf ) ) ,
}
2017-09-06 09:42:29 -07:00
}
fn handle_operator ( c : char , input : & mut CharIter ) -> TokenType {
2017-09-07 22:29:23 -07:00
let mut buf = String ::new ( ) ;
buf . push ( c ) ;
loop {
match input . peek ( ) . map ( | & ( _ , c ) | { c } ) {
Some ( c ) if ! c . is_alphabetic ( ) & & ! c . is_whitespace ( ) = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break
}
}
TokenType ::Operator ( Rc ::new ( buf ) )
2017-09-06 09:42:29 -07:00
}
2017-09-08 02:43:03 -07:00
#[cfg(test)]
mod schala_tokenizer_tests {
    use super::*;
    use super::TokenType::*;
    use super::Kw::*;

    // Shorthand constructors for the payload-carrying token kinds. The
    // literals passed to them must match the token text exactly: the
    // tokenizer never includes surrounding whitespace in a payload.
    macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } }
    macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } }
    macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } }

    /// Tokenizes `src` and keeps only the token kinds, dropping offsets.
    fn token_types(src: &str) -> Vec<TokenType> {
        tokenize(src).into_iter().map(|t| t.token_type).collect()
    }

    #[test]
    fn tokens() {
        assert_eq!(
            token_types("let a: A<B> = c ++ d"),
            vec![
                Keyword(Let), ident!("a"), Colon, ident!("A"),
                LAngleBracket, ident!("B"), RAngleBracket,
                op!("="), ident!("c"), op!("++"), ident!("d"),
            ]
        );
    }

    #[test]
    fn underscores() {
        assert_eq!(
            token_types("4_8"),
            vec![digit!("4"), Underscore, digit!("8")]
        );
    }
}
2017-09-06 09:42:29 -07:00
2017-08-29 05:08:09 -07:00
/*
2017-09-11 02:07:17 -07:00
Schala ( PROVISIONAL ! ! ) EBNF grammar
2017-08-30 04:28:52 -07:00
' ' = literal , all other symbols are nonterminals
2017-08-29 05:08:09 -07:00
program := ( statement delimiter ? ) *
2017-08-30 04:28:52 -07:00
delimiter := ' Newline ' | ';'
2017-08-29 05:08:09 -07:00
statement := declaration | expression
2017-08-30 04:28:52 -07:00
declaration := module | function | type_decl
type_decl := ' type ' type_format
type_format := ' alias ' ' = ' type | type_constructor
type_constructor := capital_ident '=' type_rhs
type_rhs := struct_decl | type_variant ( '|' type_variant ) *
struct_decl := ' struct ' ' { ' ( ident ':' type ) * '}'
type_variant := capital_ident | tuple_type | capital_ident struct_decl
tuple_type := // something like Variant(a,b)
type := // something like Type[A[b]]
ascription := expression ( ':' type ) +
function := ' fn ' prototype '{' ( statement ) * '}'
prototype := identifier '(' identlist ')'
identlist := identifier ( ',' identifier ) * | ε
2017-08-29 05:08:09 -07:00
declaration := FN prototype LCurlyBrace ( statement ) * RCurlyBrace
prototype := identifier LParen identlist RParen
identlist := Ident ( Comma Ident ) * | ε
exprlist := Expression ( Comma Expression ) * | ε
itemlist := Ident COLON Expression ( Comma Ident COLON Expression ) * | ε
expression := postop_expression ( op postop_expression ) *
postop_expression := primary_expression postop
primary_expression := number_expr | String | identifier_expr | paren_expr | conditional_expr | while_expr | lambda_expr | list_expr | struct_expr
number_expr := ( PLUS | MINUS ) number_expr | Number
identifier_expr := call_expression | Variable
list_expr := LSquareBracket exprlist RSquareBracket
struct_expr := LCurlyBrace itemlist RCurlyBrace
call_expression := Identifier LParen exprlist RParen
while_expr := WHILE primary_expression LCurlyBrace ( expression delimiter ) * RCurlyBrace
paren_expr := LParen expression RParen
conditional_expr := IF expression LCurlyBrace ( expression delimiter ) * RCurlyBrace ( LCurlyBrace ( expression delimiter ) * RCurlyBrace ) ?
lambda_expr := FN LParen identlist RParen LCurlyBrace ( expression delimiter ) * RCurlyBrace
lambda_call := | LParen exprlist RParen
postop := ε | LParen exprlist RParen | LBracket expression RBracket
op := '+' , '-' , etc .
*/
2017-09-11 02:07:17 -07:00
/* Schala EBNF Grammar */
/*
program := ( statement delimiter ) * EOF
delimiter := NEWLINE | SEMICOLON
statement := expression | declaration
declaration := type_declaration | func_declaration
2017-09-11 15:42:49 -07:00
type_declaration := TYPE identifier
2017-09-11 02:07:17 -07:00
func_declaration := FN
expression := primary
primary := literal
literal := TRUE | FALSE | number_literal | str_literal
2017-09-11 15:42:49 -07:00
identifier := IDENTIFIER
2017-09-11 02:07:17 -07:00
// a float_literal can still be assigned to an int in type-checking
number_literal := int_literal | float_literal
int_literal := ( HEX_SIGIL | BIN_SIGIL ) digits
float_literal := digits ( PERIOD digits )
digits := ( digit_group underscore ) +
*/
2017-09-09 01:25:11 -07:00
/// The parser's input cursor: an owning iterator over the tokens, with one
/// token of lookahead available through `peek`.
type TokenIter = Peekable<IntoIter<Token>>;
2017-09-11 03:21:07 -07:00
/// An error produced while parsing, carrying a human-readable description.
#[derive(Debug)]
pub struct ParseError {
    /// Description of what went wrong.
    pub msg: String,
}

impl ParseError {
    /// Builds an already-failed `ParseResult` from `msg`, so a parsing
    /// function can return `ParseError::new("...")` directly whatever its
    /// success type is.
    fn new<T>(msg: &str) -> ParseResult<T> {
        let error = ParseError { msg: String::from(msg) };
        Err(error)
    }
}

/// Result type shared by all parsing functions.
pub type ParseResult<T> = Result<T, ParseError>;
2017-09-08 16:42:42 -07:00
struct Parser {
2017-09-09 01:25:11 -07:00
tokens : TokenIter ,
}
impl Parser {
fn new ( input : Vec < Token > ) -> Parser {
Parser { tokens : input . into_iter ( ) . peekable ( ) }
}
2017-09-11 02:07:17 -07:00
fn peek ( & mut self ) -> TokenType {
self . tokens . peek ( ) . map ( | ref t | { t . token_type . clone ( ) } ) . unwrap_or ( TokenType ::EOF )
2017-09-09 01:27:15 -07:00
}
2017-09-11 02:07:17 -07:00
fn next ( & mut self ) -> TokenType {
self . tokens . next ( ) . map ( | ref t | { t . token_type . clone ( ) } ) . unwrap_or ( TokenType ::EOF )
2017-09-09 01:25:11 -07:00
}
2017-09-08 16:42:42 -07:00
}
2017-09-11 02:07:17 -07:00
/// Consumes the next token if it matches the pattern `$expected`, evaluating
/// to that token's kind. Otherwise returns early from the enclosing function
/// with a `ParseError` carrying `$msg`.
///
/// `$parser` must provide `peek()` and `next()`; the enclosing function must
/// return a `Result` whose error type is `ParseError`.
macro_rules! expect {
    ($parser:expr, $expected:pat, $msg:expr) => {
        match $parser.peek() {
            $expected => $parser.next(),
            _ => return Err(ParseError { msg: $msg.to_string() }),
        }
    };
}
2017-09-11 03:21:07 -07:00
/// A parsed program: its statements in source order.
#[derive(Debug, PartialEq)]
pub struct AST(Vec<Statement>);

/// A single top-level item of a program.
#[derive(Debug, PartialEq)]
pub enum Statement {
    /// An expression used as a statement.
    Expression(Expression),
    /// A declaration.
    Declaration(Declaration),
}

/// The kinds of declaration.
#[derive(Debug, PartialEq)]
pub enum Declaration {
    /// A function declaration; it carries no data yet.
    FuncDecl,
    /// A type declaration: the declared name and its body.
    TypeDecl(Rc<String>, TypeBody),
}

/// The body of a type declaration; currently a placeholder with one value.
#[derive(Debug, PartialEq)]
pub enum TypeBody {
    TypeBody,
}

/// The kinds of expression.
#[derive(Debug, PartialEq)]
pub enum Expression {
    /// An unsigned integer literal.
    IntLiteral(u64),
    /// A floating-point literal.
    FloatLiteral(f64),
}
2017-09-11 02:07:17 -07:00
impl Parser {
fn program ( & mut self ) -> ParseResult < AST > {
let mut statements = Vec ::new ( ) ;
loop {
match self . peek ( ) {
EOF = > break ,
Newline | Semicolon = > {
self . next ( ) ;
continue ;
} ,
_ = > statements . push ( self . statement ( ) ? ) ,
}
}
Ok ( AST ( statements ) )
}
fn statement ( & mut self ) -> ParseResult < Statement > {
use self ::Kw ::* ;
//TODO handle error recovery here
match self . peek ( ) {
Keyword ( Type ) = > self . type_declaration ( ) . map ( | decl | { Statement ::Declaration ( decl ) } ) ,
Keyword ( Func ) = > self . func_declaration ( ) . map ( | func | { Statement ::Declaration ( func ) } ) ,
_ = > self . expression ( ) . map ( | expr | { Statement ::Expression ( expr ) } ) ,
}
}
fn type_declaration ( & mut self ) -> ParseResult < Declaration > {
2017-09-11 15:42:49 -07:00
expect! ( self , Keyword ( Type ) , " Expected 'type' " ) ;
let name = self . identifier ( ) ? ;
Ok ( Declaration ::TypeDecl ( name , TypeBody ::TypeBody ) )
2017-09-11 02:07:17 -07:00
}
fn func_declaration ( & mut self ) -> ParseResult < Declaration > {
unimplemented! ( )
}
fn expression ( & mut self ) -> ParseResult < Expression > {
self . primary ( )
}
fn primary ( & mut self ) -> ParseResult < Expression > {
self . literal ( )
}
2017-09-11 15:42:49 -07:00
fn identifier ( & mut self ) -> ParseResult < Rc < String > > {
match self . next ( ) {
Identifier ( s ) = > Ok ( s ) ,
p = > ParseError ::new ( & format! ( " Expected an identifier, got {:?} " , p ) ) ,
}
}
2017-09-11 02:07:17 -07:00
fn literal ( & mut self ) -> ParseResult < Expression > {
match self . peek ( ) {
DigitGroup ( _ ) | HexNumberSigil | BinNumberSigil | Period = > self . number_literal ( ) ,
_ = > unimplemented! ( ) ,
}
}
fn number_literal ( & mut self ) -> ParseResult < Expression > {
match self . peek ( ) {
HexNumberSigil | BinNumberSigil = > self . int_literal ( ) ,
_ = > self . float_literal ( ) ,
}
}
fn int_literal ( & mut self ) -> ParseResult < Expression > {
use self ::Expression ::* ;
let digits = self . digits ( ) ? ;
match self . next ( ) {
BinNumberSigil = > {
unimplemented! ( )
} ,
HexNumberSigil = > {
unimplemented! ( )
} ,
_ = > return ParseError ::new ( " Expected '0x' or '0b' " ) ,
}
}
fn float_literal ( & mut self ) -> ParseResult < Expression > {
use self ::Expression ::* ;
2017-09-11 02:38:27 -07:00
let mut digits = self . digits ( ) ? ;
let p = self . peek ( ) ;
if let TokenType ::Period = self . peek ( ) {
self . next ( ) ;
digits . push_str ( " . " ) ;
digits . push_str ( & self . digits ( ) ? ) ;
match digits . parse ::< f64 > ( ) {
Ok ( f ) = > Ok ( FloatLiteral ( f ) ) ,
Err ( e ) = > unimplemented! ( " Float didn't parse with error: {} " , e ) ,
}
} else {
match digits . parse ::< u64 > ( ) {
2017-09-11 03:10:10 -07:00
Ok ( d ) = > Ok ( IntLiteral ( d ) ) ,
2017-09-11 02:38:27 -07:00
Err ( e ) = > unimplemented! ( " Need to handle numbers that don't parse to a Rust u64 {} " , e ) ,
}
2017-09-11 02:07:17 -07:00
}
}
fn digits ( & mut self ) -> ParseResult < String > {
let mut ds = String ::new ( ) ;
loop {
2017-09-11 02:38:27 -07:00
match self . peek ( ) {
Underscore = > { self . next ( ) ; continue ; } ,
DigitGroup ( ref s ) = > { self . next ( ) ; ds . push_str ( s ) } ,
2017-09-11 02:07:48 -07:00
_ = > break ,
2017-09-11 02:07:17 -07:00
}
}
Ok ( ds )
}
}
2017-09-09 00:31:15 -07:00
/// Parses a token list into an `AST`, or returns the first `ParseError`
/// encountered.
pub fn parse(input: Vec<Token>) -> Result<AST, ParseError> {
    Parser::new(input).program()
}
2017-09-11 03:21:07 -07:00
#[cfg(test)]
mod parse_tests {
    // `ParseError` is already in scope through the glob import.
    use super::*;
    use super::Statement::*;
    use super::Expression::*;

    #[test]
    fn test_parsing() {
        let a = "8.1";
        assert_eq!(parse(tokenize(a)).unwrap(), AST(vec![Expression(FloatLiteral(8.1))]));
    }

    #[test]
    fn integer_literal() {
        // No period after the digits, so the literal stays an integer.
        assert_eq!(parse(tokenize("42")).unwrap(), AST(vec![Expression(IntLiteral(42))]));
    }
}