2018-02-23 01:58:06 -08:00
use itertools ::Itertools ;
use std ::collections ::HashMap ;
use std ::rc ::Rc ;
2018-03-03 00:28:52 -08:00
use std ::iter ::{ Iterator , Peekable } ;
2018-03-02 22:11:25 -08:00
use std ::fmt ;
2018-02-23 01:58:06 -08:00
#[ derive(Debug, PartialEq, Clone) ]
2018-11-16 23:17:34 -08:00
pub enum TokenKind {
2018-02-23 01:58:06 -08:00
Newline , Semicolon ,
LParen , RParen ,
LSquareBracket , RSquareBracket ,
LAngleBracket , RAngleBracket ,
LCurlyBrace , RCurlyBrace ,
2018-11-05 18:50:45 -08:00
Pipe , Backslash ,
2018-02-23 01:58:06 -08:00
Comma , Period , Colon , Underscore ,
2018-03-17 19:12:58 -07:00
Slash ,
2018-02-23 01:58:06 -08:00
Operator ( Rc < String > ) ,
DigitGroup ( Rc < String > ) , HexLiteral ( Rc < String > ) , BinNumberSigil ,
StrLiteral ( Rc < String > ) ,
Identifier ( Rc < String > ) ,
Keyword ( Kw ) ,
EOF ,
Error ( String ) ,
}
2018-11-16 23:17:34 -08:00
use self ::TokenKind ::* ;
2018-02-23 01:58:06 -08:00
2018-11-16 23:17:34 -08:00
impl fmt ::Display for TokenKind {
2018-03-02 22:11:25 -08:00
fn fmt ( & self , f : & mut fmt ::Formatter ) -> fmt ::Result {
match self {
& Operator ( ref s ) = > write! ( f , " Operator({}) " , * * s ) ,
& DigitGroup ( ref s ) = > write! ( f , " DigitGroup({}) " , s ) ,
& HexLiteral ( ref s ) = > write! ( f , " HexLiteral({}) " , s ) ,
& StrLiteral ( ref s ) = > write! ( f , " StrLiteral({}) " , s ) ,
& Identifier ( ref s ) = > write! ( f , " Identifier({}) " , s ) ,
& Error ( ref s ) = > write! ( f , " Error({}) " , s ) ,
other = > write! ( f , " {:?} " , other ) ,
}
}
}
2018-02-23 01:58:06 -08:00
/// Reserved words of the language.
///
/// The source spelling of each keyword is defined by the `KEYWORDS` table.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kw {
    // Conditionals and pattern tests.
    If,
    Then,
    Else,
    Is,

    /// Function definition (spelled `fn` in source).
    Func,

    // Loops.
    For,
    While,

    // Bindings.
    Const,
    Let,
    In,
    Mut,

    Return,

    // Type-level declarations.
    Alias,
    Type,
    /// The type-position keyword spelled `Self`.
    SelfType,
    /// The value-position keyword spelled `self`.
    SelfIdent,
    Interface,
    Impl,

    // Boolean literals.
    True,
    False,

    Module,
}
lazy_static! {
    /// Maps the exact source spelling of every reserved word to its `Kw`.
    ///
    /// Lookups are case-sensitive: `Self` and `self` are distinct keywords.
    static ref KEYWORDS: HashMap<&'static str, Kw> = {
        let table: [(&'static str, Kw); 21] = [
            ("if", Kw::If),
            ("then", Kw::Then),
            ("else", Kw::Else),
            ("is", Kw::Is),
            ("fn", Kw::Func),
            ("for", Kw::For),
            ("while", Kw::While),
            ("const", Kw::Const),
            ("let", Kw::Let),
            ("in", Kw::In),
            ("mut", Kw::Mut),
            ("return", Kw::Return),
            ("alias", Kw::Alias),
            ("type", Kw::Type),
            ("Self", Kw::SelfType),
            ("self", Kw::SelfIdent),
            ("interface", Kw::Interface),
            ("impl", Kw::Impl),
            ("true", Kw::True),
            ("false", Kw::False),
            ("module", Kw::Module),
        ];
        table.iter().cloned().collect()
    };
}
2018-03-02 00:42:52 -08:00
#[ derive(Debug, Clone) ]
2018-02-23 01:58:06 -08:00
pub struct Token {
2018-11-16 23:17:34 -08:00
pub kind : TokenKind ,
2019-01-08 02:11:19 -08:00
pub line_num : usize ,
pub char_num : usize
2018-11-16 03:51:03 -08:00
}
2018-02-23 01:58:06 -08:00
impl Token {
2018-05-02 01:14:46 -07:00
pub fn get_error ( & self ) -> Option < String > {
2018-11-16 23:17:34 -08:00
match self . kind {
TokenKind ::Error ( ref s ) = > Some ( s . clone ( ) ) ,
2018-02-23 01:58:06 -08:00
_ = > None ,
}
}
2018-03-02 22:11:25 -08:00
pub fn to_string_with_metadata ( & self ) -> String {
2019-01-08 02:11:19 -08:00
format! ( " {} (L: {} ,c: {} ) " , self . kind , self . line_num , self . char_num )
2018-03-02 22:11:25 -08:00
}
2018-11-16 03:51:03 -08:00
2018-11-16 23:17:34 -08:00
pub fn get_kind ( & self ) -> TokenKind {
self . kind . clone ( )
2018-11-16 03:51:03 -08:00
}
2018-02-23 01:58:06 -08:00
}
2018-03-17 19:12:58 -07:00
/// Characters that may appear in a symbolic operator.
///
/// `/` is deliberately absent: it is tokenized on its own (or opens a comment).
const OPERATOR_CHARS: [char; 18] = [
    '!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`',
];

/// Reports whether `c` is one of the `OPERATOR_CHARS`.
fn is_operator(c: &char) -> bool {
    OPERATOR_CHARS.contains(c)
}
2018-05-11 02:08:05 -07:00
/// One input character tagged with its position, in the order
/// `(line index, character index within the line, character)`.
/// Both indices come from `enumerate()` and are therefore zero-based.
type CharData = ( usize , usize , char ) ;
2018-03-02 02:57:04 -08:00
2018-02-23 01:58:06 -08:00
pub fn tokenize ( input : & str ) -> Vec < Token > {
let mut tokens : Vec < Token > = Vec ::new ( ) ;
2018-03-02 02:57:04 -08:00
2018-03-02 15:15:12 -08:00
let mut input = input . lines ( ) . enumerate ( )
2018-03-24 18:38:28 -07:00
. intersperse ( ( 0 , " \n " ) )
2018-03-02 02:57:04 -08:00
. flat_map ( | ( line_idx , ref line ) | {
line . chars ( ) . enumerate ( ) . map ( move | ( ch_idx , ch ) | ( line_idx , ch_idx , ch ) )
2018-03-24 18:38:28 -07:00
} )
. peekable ( ) ;
2018-03-02 02:57:04 -08:00
2019-01-08 02:11:19 -08:00
while let Some ( ( line_num , char_num , c ) ) = input . next ( ) {
2018-11-16 23:17:34 -08:00
let cur_tok_kind = match c {
2018-03-17 22:25:43 -07:00
'/' = > match input . peek ( ) . map ( | t | t . 2 ) {
Some ( '/' ) = > {
2018-03-02 15:15:12 -08:00
while let Some ( ( _ , _ , c ) ) = input . next ( ) {
2018-02-23 01:58:06 -08:00
if c = = '\n' {
break ;
}
}
2018-03-17 19:12:58 -07:00
continue ;
} ,
2018-03-17 22:25:43 -07:00
Some ( '*' ) = > {
input . next ( ) ;
let mut comment_level = 1 ;
while let Some ( ( _ , _ , c ) ) = input . next ( ) {
if c = = '*' & & input . peek ( ) . map ( | t | t . 2 ) = = Some ( '/' ) {
input . next ( ) ;
comment_level - = 1 ;
} else if c = = '/' & & input . peek ( ) . map ( | t | t . 2 ) = = Some ( '*' ) {
input . next ( ) ;
comment_level + = 1 ;
}
if comment_level = = 0 {
break ;
}
}
continue ;
2018-03-17 19:12:58 -07:00
} ,
_ = > Slash
2018-02-23 01:58:06 -08:00
} ,
c if c . is_whitespace ( ) & & c ! = '\n' = > continue ,
'\n' = > Newline , ';' = > Semicolon ,
':' = > Colon , ',' = > Comma ,
'(' = > LParen , ')' = > RParen ,
'{' = > LCurlyBrace , '}' = > RCurlyBrace ,
'[' = > LSquareBracket , ']' = > RSquareBracket ,
'"' = > handle_quote ( & mut input ) ,
2018-11-05 18:50:45 -08:00
'\\' = > Backslash ,
2018-02-23 01:58:06 -08:00
c if c . is_digit ( 10 ) = > handle_digit ( c , & mut input ) ,
c if c . is_alphabetic ( ) | | c = = '_' = > handle_alphabetic ( c , & mut input ) , //TODO I'll probably have to rewrite this if I care about types being uppercase, also type parameterization
c if is_operator ( & c ) = > handle_operator ( c , & mut input ) ,
unknown = > Error ( format! ( " Unexpected character: {} " , unknown ) ) ,
} ;
2019-01-08 02:11:19 -08:00
tokens . push ( Token { kind : cur_tok_kind , line_num , char_num } ) ;
2018-02-23 01:58:06 -08:00
}
tokens
}
2018-11-16 23:17:34 -08:00
fn handle_digit ( c : char , input : & mut Peekable < impl Iterator < Item = CharData > > ) -> TokenKind {
2018-03-02 15:15:12 -08:00
if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , _ , c ) | { c = = 'x' } ) {
2018-02-23 01:58:06 -08:00
input . next ( ) ;
2018-03-02 15:15:12 -08:00
let rest : String = input . peeking_take_while ( | & ( _ , _ , ref c ) | c . is_digit ( 16 ) | | * c = = '_' ) . map ( | ( _ , _ , c ) | { c } ) . collect ( ) ;
2018-02-23 01:58:06 -08:00
HexLiteral ( Rc ::new ( rest ) )
2018-03-02 15:15:12 -08:00
} else if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , _ , c ) | { c = = 'b' } ) {
2018-02-23 01:58:06 -08:00
input . next ( ) ;
BinNumberSigil
} else {
let mut buf = c . to_string ( ) ;
2018-03-02 15:15:12 -08:00
buf . extend ( input . peeking_take_while ( | & ( _ , _ , ref c ) | c . is_digit ( 10 ) ) . map ( | ( _ , _ , c ) | { c } ) ) ;
2018-02-23 01:58:06 -08:00
DigitGroup ( Rc ::new ( buf ) )
}
}
2018-11-16 23:17:34 -08:00
fn handle_quote ( input : & mut Peekable < impl Iterator < Item = CharData > > ) -> TokenKind {
2018-02-23 01:58:06 -08:00
let mut buf = String ::new ( ) ;
loop {
2018-03-02 15:15:12 -08:00
match input . next ( ) . map ( | ( _ , _ , c ) | { c } ) {
2018-02-23 01:58:06 -08:00
Some ( '"' ) = > break ,
Some ( '\\' ) = > {
2018-03-02 15:15:12 -08:00
let next = input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) ;
2018-02-23 01:58:06 -08:00
if next = = Some ( 'n' ) {
input . next ( ) ;
buf . push ( '\n' )
} else if next = = Some ( '"' ) {
input . next ( ) ;
buf . push ( '"' ) ;
} else if next = = Some ( 't' ) {
input . next ( ) ;
buf . push ( '\t' ) ;
}
} ,
Some ( c ) = > buf . push ( c ) ,
2018-11-16 23:17:34 -08:00
None = > return TokenKind ::Error ( format! ( " Unclosed string " ) ) ,
2018-02-23 01:58:06 -08:00
}
}
2018-11-16 23:17:34 -08:00
TokenKind ::StrLiteral ( Rc ::new ( buf ) )
2018-02-23 01:58:06 -08:00
}
2018-11-16 23:17:34 -08:00
fn handle_alphabetic ( c : char , input : & mut Peekable < impl Iterator < Item = CharData > > ) -> TokenKind {
2018-02-23 01:58:06 -08:00
let mut buf = String ::new ( ) ;
buf . push ( c ) ;
2018-03-02 15:15:12 -08:00
if c = = '_' & & input . peek ( ) . map ( | & ( _ , _ , c ) | { ! c . is_alphabetic ( ) } ) . unwrap_or ( true ) {
2018-11-16 23:17:34 -08:00
return TokenKind ::Underscore
2018-02-23 01:58:06 -08:00
}
loop {
2018-03-02 15:15:12 -08:00
match input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) {
2018-11-15 16:19:53 -08:00
Some ( c ) if c . is_alphanumeric ( ) | | c = = '_' = > {
2018-02-23 01:58:06 -08:00
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break ,
}
}
match KEYWORDS . get ( buf . as_str ( ) ) {
2018-11-16 23:17:34 -08:00
Some ( kw ) = > TokenKind ::Keyword ( * kw ) ,
None = > TokenKind ::Identifier ( Rc ::new ( buf ) ) ,
2018-02-23 01:58:06 -08:00
}
}
2018-11-16 23:17:34 -08:00
fn handle_operator ( c : char , input : & mut Peekable < impl Iterator < Item = CharData > > ) -> TokenKind {
2018-02-23 01:58:06 -08:00
match c {
'<' | '>' | '|' | '.' = > {
2018-03-02 15:15:12 -08:00
let ref next = input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) ;
2018-02-23 01:58:06 -08:00
if ! next . map ( | n | { is_operator ( & n ) } ) . unwrap_or ( false ) {
return match c {
'<' = > LAngleBracket ,
'>' = > RAngleBracket ,
'|' = > Pipe ,
'.' = > Period ,
_ = > unreachable! ( ) ,
}
}
} ,
_ = > ( ) ,
} ;
let mut buf = String ::new ( ) ;
2018-04-25 03:01:41 -07:00
if c = = '`' {
loop {
match input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) {
Some ( c ) if c . is_alphabetic ( ) | | c = = '_' = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
Some ( '`' ) = > {
input . next ( ) ;
break ;
} ,
_ = > break
}
}
} else {
buf . push ( c ) ;
loop {
match input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) {
Some ( c ) if is_operator ( & c ) = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break
}
2018-02-23 01:58:06 -08:00
}
}
2018-11-16 23:17:34 -08:00
TokenKind ::Operator ( Rc ::new ( buf ) )
2018-02-23 01:58:06 -08:00
}
#[cfg(test)]
mod schala_tokenizer_tests {
    use super::*;
    use super::Kw::*;

    // Shorthand constructors for the text-carrying token kinds.
    macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } }
    macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } }
    macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } }

    /// Tokenizes `source` and keeps only the token kinds, dropping positions.
    fn kinds_of(source: &str) -> Vec<TokenKind> {
        tokenize(source).into_iter().map(|token| token.kind).collect()
    }

    #[test]
    fn tokens() {
        let expected = vec![
            Keyword(Let),
            ident!("a"),
            Colon,
            ident!("A"),
            LAngleBracket,
            ident!("B"),
            RAngleBracket,
            op!("="),
            ident!("c"),
            op!("++"),
            ident!("d"),
        ];
        assert_eq!(kinds_of("let a: A<B> = c ++ d"), expected);
    }

    #[test]
    fn underscores() {
        // Between digits an underscore is its own token...
        assert_eq!(kinds_of("4_8"), vec![digit!("4"), Underscore, digit!("8")]);

        // ...but inside a word it is part of the identifier.
        assert_eq!(kinds_of("aba_yo"), vec![ident!("aba_yo")]);
    }

    #[test]
    fn comments() {
        // Block comments nest.
        assert_eq!(
            kinds_of("1 + /* hella /* bro */ */ 2"),
            vec![digit!("1"), op!("+"), digit!("2")]
        );
    }

    #[test]
    fn backtick_operators() {
        assert_eq!(
            kinds_of("1 `plus` 2"),
            vec![digit!("1"), op!("plus"), digit!("2")]
        );
    }
}