Run rustfmt on tokenizer code

Greg Shuflin 2021-10-29 19:03:42 -07:00
parent 304df5c50e
commit 8111f69640


@@ -1,8 +1,13 @@
#![allow(clippy::upper_case_acronyms)]

use std::{
    convert::{TryFrom, TryInto},
    fmt,
    iter::{Iterator, Peekable},
    rc::Rc,
};

use itertools::Itertools;

/// A location in a particular source file. Note that the
/// sizes of the internal unsigned integer types limit
@@ -10,370 +15,444 @@ use std::convert::TryInto;
/// at most 2^16 characters, which should be plenty big.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
pub struct Location {
    pub(crate) line_num: u32,
    pub(crate) char_num: u16,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}:{}", self.line_num, self.char_num)
    }
}

#[derive(Debug, PartialEq, Clone)]
pub enum TokenKind {
    Newline,
    Semicolon,

    LParen,
    RParen,
    LSquareBracket,
    RSquareBracket,
    LAngleBracket,
    RAngleBracket,
    LCurlyBrace,
    RCurlyBrace,
    Pipe,
    Backslash,
    AtSign,

    Comma,
    Period,
    Colon,
    Underscore,
    Slash,
    Equals,

    Operator(Rc<String>),
    DigitGroup(Rc<String>),
    HexLiteral(Rc<String>),
    BinNumberSigil,
    StrLiteral { s: Rc<String>, prefix: Option<Rc<String>> },
    Identifier(Rc<String>),
    Keyword(Kw),

    EOF,

    Error(String),
}
use self::TokenKind::*;

impl fmt::Display for TokenKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            &Operator(ref s) => write!(f, "Operator({})", **s),
            &DigitGroup(ref s) => write!(f, "DigitGroup({})", s),
            &HexLiteral(ref s) => write!(f, "HexLiteral({})", s),
            &StrLiteral { ref s, .. } => write!(f, "StrLiteral({})", s),
            &Identifier(ref s) => write!(f, "Identifier({})", s),
            &Error(ref s) => write!(f, "Error({})", s),
            other => write!(f, "{:?}", other),
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kw {
    If,
    Then,
    Else,
    Is,
    Func,
    For,
    While,
    Const,
    Let,
    In,
    Mut,
    Return,
    Alias,
    Type,
    SelfType,
    SelfIdent,
    Interface,
    Impl,
    True,
    False,
    Module,
    Import,
}

impl TryFrom<&str> for Kw {
    type Error = ();
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Ok(match value {
            "if" => Kw::If,
            "then" => Kw::Then,
            "else" => Kw::Else,
            "is" => Kw::Is,
            "fn" => Kw::Func,
            "for" => Kw::For,
            "while" => Kw::While,
            "const" => Kw::Const,
            "let" => Kw::Let,
            "in" => Kw::In,
            "mut" => Kw::Mut,
            "return" => Kw::Return,
            "alias" => Kw::Alias,
            "type" => Kw::Type,
            "Self" => Kw::SelfType,
            "self" => Kw::SelfIdent,
            "interface" => Kw::Interface,
            "impl" => Kw::Impl,
            "true" => Kw::True,
            "false" => Kw::False,
            "module" => Kw::Module,
            "import" => Kw::Import,
            _ => return Err(()),
        })
    }
}

#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub(crate) location: Location,
}

impl Token {
    pub fn to_string_with_metadata(&self) -> String {
        format!("{}({})", self.kind, self.location)
    }

    pub fn get_kind(&self) -> TokenKind {
        self.kind.clone()
    }
}

const OPERATOR_CHARS: [char; 17] =
    ['!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '^', '|', '~', '`'];

fn is_operator(c: &char) -> bool {
    OPERATOR_CHARS.iter().any(|x| x == c)
}

type CharData = (usize, usize, char);

pub fn tokenize(input: &str) -> Vec<Token> {
    let mut tokens: Vec<Token> = Vec::new();

    let mut input = Iterator::intersperse(input.lines().enumerate(), (0, "\n"))
        .flat_map(|(line_idx, line)| line.chars().enumerate().map(move |(ch_idx, ch)| (line_idx, ch_idx, ch)))
        .peekable();

    while let Some((line_num, char_num, c)) = input.next() {
        let cur_tok_kind = match c {
            '/' => match input.peek().map(|t| t.2) {
                Some('/') => {
                    for (_, _, c) in input.by_ref() {
                        if c == '\n' {
                            break;
                        }
                    }
                    continue;
                }
                Some('*') => {
                    input.next();
                    let mut comment_level = 1;
                    while let Some((_, _, c)) = input.next() {
                        if c == '*' && input.peek().map(|t| t.2) == Some('/') {
                            input.next();
                            comment_level -= 1;
                        } else if c == '/' && input.peek().map(|t| t.2) == Some('*') {
                            input.next();
                            comment_level += 1;
                        }
                        if comment_level == 0 {
                            break;
                        }
                    }
                    if comment_level != 0 {
                        Error("Unclosed comment".to_string())
                    } else {
                        continue;
                    }
                }
                _ => Slash,
            },
            c if c.is_whitespace() && c != '\n' => continue,
            '\n' => Newline,
            ';' => Semicolon,
            ':' => Colon,
            ',' => Comma,
            '(' => LParen,
            ')' => RParen,
            '{' => LCurlyBrace,
            '}' => RCurlyBrace,
            '[' => LSquareBracket,
            ']' => RSquareBracket,
            '"' => handle_quote(&mut input, None),
            '\\' => Backslash,
            '@' => AtSign,
            c if c.is_digit(10) => handle_digit(c, &mut input),
            c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input),
            c if is_operator(&c) => handle_operator(c, &mut input),
            unknown => Error(format!("Unexpected character: {}", unknown)),
        };
        let location =
            Location { line_num: line_num.try_into().unwrap(), char_num: char_num.try_into().unwrap() };
        tokens.push(Token { kind: cur_tok_kind, location });
    }
    tokens
}

fn handle_digit(c: char, input: &mut Peekable<impl Iterator<Item = CharData>>) -> TokenKind {
    let next_ch = input.peek().map(|&(_, _, c)| c);

    if c == '0' && next_ch == Some('x') {
        input.next();
        let rest: String = input
            .peeking_take_while(|&(_, _, ref c)| c.is_digit(16) || *c == '_')
            .map(|(_, _, c)| c)
            .collect();
        HexLiteral(Rc::new(rest))
    } else if c == '0' && next_ch == Some('b') {
        input.next();
        BinNumberSigil
    } else {
        let mut buf = c.to_string();
        buf.extend(input.peeking_take_while(|&(_, _, ref c)| c.is_digit(10)).map(|(_, _, c)| c));
        DigitGroup(Rc::new(buf))
    }
}

fn handle_quote(
    input: &mut Peekable<impl Iterator<Item = CharData>>,
    quote_prefix: Option<&str>,
) -> TokenKind {
    let mut buf = String::new();
    loop {
        match input.next().map(|(_, _, c)| c) {
            Some('"') => break,
            Some('\\') => {
                let next = input.peek().map(|&(_, _, c)| c);
                if next == Some('n') {
                    input.next();
                    buf.push('\n')
                } else if next == Some('"') {
                    input.next();
                    buf.push('"');
                } else if next == Some('t') {
                    input.next();
                    buf.push('\t');
                }
            }
            Some(c) => buf.push(c),
            None => return TokenKind::Error("Unclosed string".to_string()),
        }
    }
    TokenKind::StrLiteral { s: Rc::new(buf), prefix: quote_prefix.map(|s| Rc::new(s.to_string())) }
}

fn handle_alphabetic(c: char, input: &mut Peekable<impl Iterator<Item = CharData>>) -> TokenKind {
    let mut buf = String::new();
    buf.push(c);
    let next_is_alphabetic = input.peek().map(|&(_, _, c)| !c.is_alphabetic()).unwrap_or(true);
    if c == '_' && next_is_alphabetic {
        return TokenKind::Underscore;
    }

    loop {
        match input.peek().map(|&(_, _, c)| c) {
            Some(c) if c == '"' => {
                input.next();
                return handle_quote(input, Some(&buf));
            }
            Some(c) if c.is_alphanumeric() || c == '_' => {
                input.next();
                buf.push(c);
            }
            _ => break,
        }
    }

    match Kw::try_from(buf.as_str()) {
        Ok(kw) => TokenKind::Keyword(kw),
        Err(()) => TokenKind::Identifier(Rc::new(buf)),
    }
}

fn handle_operator(c: char, input: &mut Peekable<impl Iterator<Item = CharData>>) -> TokenKind {
    match c {
        '<' | '>' | '|' | '.' | '=' => {
            let next = &input.peek().map(|&(_, _, c)| c);
            let next_is_op = next.map(|n| is_operator(&n)).unwrap_or(false);
            if !next_is_op {
                return match c {
                    '<' => LAngleBracket,
                    '>' => RAngleBracket,
                    '|' => Pipe,
                    '.' => Period,
                    '=' => Equals,
                    _ => unreachable!(),
                };
            }
        }
        _ => (),
    };

    let mut buf = String::new();

    if c == '`' {
        loop {
            match input.peek().map(|&(_, _, c)| c) {
                Some(c) if c.is_alphabetic() || c == '_' => {
                    input.next();
                    buf.push(c);
                }
                Some('`') => {
                    input.next();
                    break;
                }
                _ => break,
            }
        }
    } else {
        buf.push(c);
        loop {
            match input.peek().map(|&(_, _, c)| c) {
                Some(c) if is_operator(&c) => {
                    input.next();
                    buf.push(c);
                }
                _ => break,
            }
        }
    }
    TokenKind::Operator(Rc::new(buf))
}

#[cfg(test)]
mod schala_tokenizer_tests {
    use super::{Kw::*, *};

    macro_rules! digit {
        ($ident:expr) => {
            DigitGroup(Rc::new($ident.to_string()))
        };
    }
    macro_rules! ident {
        ($ident:expr) => {
            Identifier(Rc::new($ident.to_string()))
        };
    }
    macro_rules! op {
        ($ident:expr) => {
            Operator(Rc::new($ident.to_string()))
        };
    }

    fn token_kinds(input: &str) -> Vec<TokenKind> {
        tokenize(input).into_iter().map(move |tok| tok.kind).collect()
    }

    #[test]
    fn tokens() {
        let output = token_kinds("let a: A<B> = c ++ d");
        assert_eq!(
            output,
            vec![
                Keyword(Let),
                ident!("a"),
                Colon,
                ident!("A"),
                LAngleBracket,
                ident!("B"),
                RAngleBracket,
                Equals,
                ident!("c"),
                op!("++"),
                ident!("d")
            ]
        );
    }

    #[test]
    fn underscores() {
        let output = token_kinds("4_8");
        assert_eq!(output, vec![digit!("4"), Underscore, digit!("8")]);

        let output = token_kinds("aba_yo");
        assert_eq!(output, vec![ident!("aba_yo")]);
    }

    #[test]
    fn comments() {
        let output = token_kinds("1 + /* hella /* bro */ */ 2");
        assert_eq!(output, vec![digit!("1"), op!("+"), digit!("2")]);

        let output = token_kinds("1 + /* hella /* bro */ 2");
        assert_eq!(output, vec![digit!("1"), op!("+"), Error("Unclosed comment".to_string())]);

        //TODO not sure if I want this behavior
        let output = token_kinds("1 + /* hella */ bro */ 2");
        assert_eq!(
            output,
            vec![
                digit!("1"),
                op!("+"),
                Identifier(Rc::new("bro".to_string())),
                Operator(Rc::new("*".to_string())),
                Slash,
                DigitGroup(Rc::new("2".to_string()))
            ]
        );
    }

    #[test]
    fn backtick_operators() {
        let output = token_kinds("1 `plus` 2");
        assert_eq!(output, vec![digit!("1"), op!("plus"), digit!("2")]);
    }

    #[test]
    fn string_literals() {
        let output = token_kinds(r#""some string""#);
        assert_eq!(output, vec![StrLiteral { s: Rc::new("some string".to_string()), prefix: None }]);

        let output = token_kinds(r#"b"some bytestring""#);
        assert_eq!(
            output,
            vec![StrLiteral {
                s: Rc::new("some bytestring".to_string()),
                prefix: Some(Rc::new("b".to_string()))
            }]
        );

        let output = token_kinds(r#""Do \n \" escapes work\t""#);
        assert_eq!(
            output,
            vec![StrLiteral { s: Rc::new("Do \n \" escapes work\t".to_string()), prefix: None }]
        );
    }
}
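
As context for reviewing the reformatted code, here is a minimal usage sketch of the public surface shown in this diff (`tokenize`, `Token::to_string_with_metadata`, `Token::get_kind`, `TokenKind::Error`). The module path `crate::tokenizing` and the `dump_tokens` driver are assumptions for illustration only; they are not part of this commit.

// Hypothetical driver; the path `crate::tokenizing` is an assumption,
// not something this commit establishes.
use crate::tokenizing::{tokenize, TokenKind};

/// Tokenize a source snippet, print each token with its location,
/// and report whether the lexer produced any `Error` tokens.
fn dump_tokens(source: &str) -> bool {
    let tokens = tokenize(source);
    for tok in &tokens {
        // `to_string_with_metadata` renders as "<kind>(<line>:<char>)".
        println!("{}", tok.to_string_with_metadata());
    }
    tokens.iter().any(|tok| matches!(tok.get_kind(), TokenKind::Error(_)))
}

fn main() {
    let had_errors = dump_tokens("let a: A<B> = c ++ d");
    println!("lexing errors: {}", had_errors);
}

Because lexing failures surface as ordinary `TokenKind::Error` tokens rather than panics, a caller can report every problem in a single pass, which is the behavior the `comments` and `string_literals` tests above exercise.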