From 861173581c22bf09b6853ab2f28b052b856991bc Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sat, 18 Nov 2017 03:36:02 -0800 Subject: [PATCH] Refactor Tokenizer (#260) --- src/main.rs | 11 +- src/parser.rs | 6 + src/run.rs | 3 +- src/scanner.rs | 600 +++++++++++++++++++++++++++++++++++++++++++++++ src/testing.rs | 6 +- src/tokenizer.rs | 585 --------------------------------------------- 6 files changed, 611 insertions(+), 600 deletions(-) create mode 100644 src/scanner.rs delete mode 100644 src/tokenizer.rs diff --git a/src/main.rs b/src/main.rs index 7b3aa2d..1e12f1c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,11 +33,9 @@ mod recipe; mod recipe_resolver; mod run; mod runtime_error; +mod scanner; mod shebang; mod token; -mod tokenizer; - -use tokenizer::tokenize; mod common { pub use std::borrow::Cow; @@ -70,18 +68,13 @@ mod common { pub use recipe::Recipe; pub use recipe_resolver::RecipeResolver; pub use runtime_error::{RuntimeError, RunResult}; + pub use scanner::Scanner; pub use shebang::Shebang; pub use token::{Token, TokenKind}; } use common::*; -fn compile(text: &str) -> CompilationResult { - let tokens = tokenize(text)?; - let parser = Parser::new(text, tokens); - parser.justfile() -} - fn main() { run::run(); } diff --git a/src/parser.rs b/src/parser.rs index dc94016..01d653d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -14,6 +14,12 @@ pub struct Parser<'a> { } impl<'a> Parser<'a> { + pub fn parse(text: &'a str) -> CompilationResult<'a, Justfile> { + let tokens = Scanner::scan(text)?; + let parser = Parser::new(text, tokens); + parser.justfile() + } + pub fn new(text: &'a str, tokens: Vec>) -> Parser<'a> { Parser { text: text, diff --git a/src/run.rs b/src/run.rs index 59af6ad..789919b 100644 --- a/src/run.rs +++ b/src/run.rs @@ -2,7 +2,6 @@ use common::*; use std::{convert, ffi}; use clap::{App, Arg, ArgGroup, AppSettings}; -use compile; use misc::maybe_s; use configuration::DEFAULT_SHELL; @@ -232,7 +231,7 @@ pub fn run() { .unwrap_or_else(|error| die!("Error reading justfile: {}", error)); } - let justfile = compile(&text).unwrap_or_else(|error| + let justfile = Parser::parse(&text).unwrap_or_else(|error| if color.stderr().active() { die!("{:#}", error); } else { diff --git a/src/scanner.rs b/src/scanner.rs new file mode 100644 index 0000000..234c430 --- /dev/null +++ b/src/scanner.rs @@ -0,0 +1,600 @@ +use common::*; + +use TokenKind::*; +use CompilationErrorKind::*; + +fn re(pattern: &str) -> Regex { + Regex::new(pattern).unwrap() +} + +fn token(pattern: &str) -> Regex { + let mut s = String::new(); + s += r"^(?m)([ \t]*)("; + s += pattern; + s += ")"; + re(&s) +} + +fn mixed_whitespace(text: &str) -> bool { + !(text.chars().all(|c| c == ' ') || text.chars().all(|c| c == '\t')) +} + +pub struct Scanner<'a> { + tokens: Vec>, + text: &'a str, + rest: &'a str, + index: usize, + column: usize, + line: usize, + state: Vec>, +} + +#[derive(PartialEq)] +enum State<'a> { + Start, + Indent(&'a str), + Text, + Interpolation, +} + +impl<'a> Scanner<'a> { + pub fn scan(text: &'a str) -> CompilationResult>> { + let scanner = Scanner{ + tokens: vec![], + text: text, + rest: text, + index: 0, + line: 0, + column: 0, + state: vec![State::Start], + }; + + scanner.inner() + } + + fn error(&self, kind: CompilationErrorKind<'a>) -> CompilationError<'a> { + CompilationError { + text: self.text, + index: self.index, + line: self.line, + column: self.column, + width: None, + kind: kind, + } + } + + fn token(&self, prefix: &'a str, lexeme: &'a str, kind: TokenKind) -> Token<'a> { + Token { + index: self.index, + line: self.line, + column: self.column, + text: self.text, + prefix: prefix, + lexeme: lexeme, + kind: kind, + } + } + + fn scan_indent(&mut self) -> CompilationResult<'a, Option>> { + lazy_static! { + static ref INDENT: Regex = re(r"^([ \t]*)[^ \t\n\r]"); + } + + let indentation = INDENT.captures(self.rest).map(|captures| captures.get(1).unwrap().as_str()); + + if self.column == 0 { + if let Some(kind) = match (self.state.last().unwrap(), indentation) { + // ignore: was no indentation and there still isn't + // or current line is blank + (&State::Start, Some("")) | (_, None) => { + None + } + // indent: was no indentation, now there is + (&State::Start, Some(current)) => { + if mixed_whitespace(current) { + return Err(self.error(MixedLeadingWhitespace{whitespace: current})); + } + //indent = Some(current); + self.state.push(State::Indent(current)); + Some(Indent) + } + // dedent: there was indentation and now there isn't + (&State::Indent(_), Some("")) => { + // indent = None; + self.state.pop(); + Some(Dedent) + } + // was indentation and still is, check if the new indentation matches + (&State::Indent(previous), Some(current)) => { + if !current.starts_with(previous) { + return Err(self.error(InconsistentLeadingWhitespace{ + expected: previous, + found: current + })); + } + None + } + // at column 0 in some other state: this should never happen + (&State::Text, _) | (&State::Interpolation, _) => { + return Err(self.error(Internal { + message: "unexpected state at column 0".to_string() + })); + } + } { + return Ok(Some(self.token("", "", kind))); + } + } + Ok(None) + } + + pub fn inner(mut self) -> CompilationResult<'a, Vec>> { + lazy_static! { + static ref BACKTICK: Regex = token(r"`[^`\n\r]*`" ); + static ref COLON: Regex = token(r":" ); + static ref AT: Regex = token(r"@" ); + static ref COMMENT: Regex = token(r"#([^!\n\r].*)?$" ); + static ref EOF: Regex = token(r"(?-m)$" ); + static ref EOL: Regex = token(r"\n|\r\n" ); + static ref EQUALS: Regex = token(r"=" ); + static ref INTERPOLATION_END: Regex = token(r"[}][}]" ); + static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]" ); + static ref NAME: Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)" ); + static ref PLUS: Regex = token(r"[+]" ); + static ref STRING: Regex = token("\"" ); + static ref RAW_STRING: Regex = token(r#"'[^']*'"# ); + static ref UNTERMINATED_RAW_STRING: Regex = token(r#"'[^']*"# ); + static ref INTERPOLATION_START: Regex = re(r"^[{][{]" ); + static ref LEADING_TEXT: Regex = re(r"^(?m)(.+?)[{][{]" ); + static ref LINE: Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$"); + static ref TEXT: Regex = re(r"^(?m)(.+)" ); + } + + loop { + if let Some(token) = self.scan_indent()? { + self.tokens.push(token); + } + + // insert a dedent if we're indented and we hit the end of the file + if &State::Start != self.state.last().unwrap() && EOF.is_match(self.rest) { + let token = self.token("", "", Dedent); + self.tokens.push(token); + } + + let (prefix, lexeme, kind) = + if let (0, &State::Indent(indent), Some(captures)) = + (self.column, self.state.last().unwrap(), LINE.captures(self.rest)) { + let line = captures.get(0).unwrap().as_str(); + if !line.starts_with(indent) { + return Err(self.error(Internal{message: "unexpected indent".to_string()})); + } + self.state.push(State::Text); + (&line[0..indent.len()], "", Line) + } else if let Some(captures) = EOF.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eof) + } else if let State::Text = *self.state.last().unwrap() { + if let Some(captures) = INTERPOLATION_START.captures(self.rest) { + self.state.push(State::Interpolation); + ("", captures.get(0).unwrap().as_str(), InterpolationStart) + } else if let Some(captures) = LEADING_TEXT.captures(self.rest) { + ("", captures.get(1).unwrap().as_str(), Text) + } else if let Some(captures) = TEXT.captures(self.rest) { + ("", captures.get(1).unwrap().as_str(), Text) + } else if let Some(captures) = EOL.captures(self.rest) { + self.state.pop(); + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol) + } else { + return Err(self.error(Internal { + message: format!("Could not match token in text state: \"{}\"", self.rest) + })); + } + } else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationStart) + } else if let Some(captures) = INTERPOLATION_END.captures(self.rest) { + if self.state.last().unwrap() == &State::Interpolation { + self.state.pop(); + } + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationEnd) + } else if let Some(captures) = NAME.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Name) + } else if let Some(captures) = EOL.captures(self.rest) { + if self.state.last().unwrap() == &State::Interpolation { + return Err(self.error(Internal { + message: "hit EOL while still in interpolation state".to_string() + })); + } + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol) + } else if let Some(captures) = BACKTICK.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Backtick) + } else if let Some(captures) = COLON.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Colon) + } else if let Some(captures) = AT.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), At) + } else if let Some(captures) = PLUS.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Plus) + } else if let Some(captures) = EQUALS.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Equals) + } else if let Some(captures) = COMMENT.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Comment) + } else if let Some(captures) = RAW_STRING.captures(self.rest) { + (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), RawString) + } else if UNTERMINATED_RAW_STRING.is_match(self.rest) { + return Err(self.error(UnterminatedString)); + } else if let Some(captures) = STRING.captures(self.rest) { + let prefix = captures.get(1).unwrap().as_str(); + let contents = &self.rest[prefix.len()+1..]; + if contents.is_empty() { + return Err(self.error(UnterminatedString)); + } + let mut len = 0; + let mut escape = false; + for c in contents.chars() { + if c == '\n' || c == '\r' { + return Err(self.error(UnterminatedString)); + } else if !escape && c == '"' { + break; + } else if !escape && c == '\\' { + escape = true; + } else if escape { + escape = false; + } + len += c.len_utf8(); + } + let start = prefix.len(); + let content_end = start + len + 1; + if escape || content_end >= self.rest.len() { + return Err(self.error(UnterminatedString)); + } + (prefix, &self.rest[start..content_end + 1], StringToken) + } else if self.rest.starts_with("#!") { + return Err(self.error(OuterShebang)); + } else { + return Err(self.error(UnknownStartOfToken)); + }; + + let token = self.token(prefix, lexeme, kind); + self.tokens.push(token); + + let len = prefix.len() + lexeme.len(); + + if len == 0 { + let last = self.tokens.last().unwrap(); + match last.kind { + Eof => {}, + _ => return Err(last.error(Internal { + message: format!("zero length token: {:?}", last) + })), + } + } + + match self.tokens.last().unwrap().kind { + Eol => { + self.line += 1; + self.column = 0; + } + Eof => { + break; + } + RawString => { + let lexeme_lines = lexeme.lines().count(); + self.line += lexeme_lines - 1; + if lexeme_lines == 1 { + self.column += len; + } else { + self.column = lexeme.lines().last().unwrap().len(); + } + } + _ => { + self.column += len; + } + } + + self.rest = &self.rest[len..]; + self.index += len; + } + + Ok(self.tokens) + } +} + +#[cfg(test)] +mod test { + use super::*; + + macro_rules! summary_test { + ($name:ident, $input:expr, $expected:expr $(,)*) => { + #[test] + fn $name() { + let input = $input; + let expected = $expected; + let tokens = ::Scanner::scan(input).unwrap(); + let roundtrip = tokens.iter().map(|t| { + let mut s = String::new(); + s += t.prefix; + s += t.lexeme; + s + }).collect::>().join(""); + let actual = token_summary(&tokens); + if actual != expected { + panic!("token summary mismatch:\nexpected: {}\ngot: {}\n", expected, actual); + } + assert_eq!(input, roundtrip); + } + } + } + + fn token_summary(tokens: &[Token]) -> String { + tokens.iter().map(|t| { + match t.kind { + At => "@", + Backtick => "`", + Colon => ":", + Comment{..} => "#", + Dedent => "<", + Eof => ".", + Eol => "$", + Equals => "=", + Indent{..} => ">", + InterpolationEnd => "}", + InterpolationStart => "{", + Line{..} => "^", + Name => "N", + Plus => "+", + RawString => "'", + StringToken => "\"", + Text => "_", + } + }).collect::>().join("") + } + + macro_rules! error_test { + ( + name: $name:ident, + input: $input:expr, + index: $index:expr, + line: $line:expr, + column: $column:expr, + width: $width:expr, + kind: $kind:expr, + ) => { + #[test] + fn $name() { + let input = $input; + + let expected = CompilationError { + text: input, + index: $index, + line: $line, + column: $column, + width: $width, + kind: $kind, + }; + + if let Err(error) = Scanner::scan(input) { + assert_eq!(error.text, expected.text); + assert_eq!(error.index, expected.index); + assert_eq!(error.line, expected.line); + assert_eq!(error.column, expected.column); + assert_eq!(error.kind, expected.kind); + assert_eq!(error, expected); + } else { + panic!("tokenize succeeded but expected: {}\n{}", expected, input); + } + } + } + } + + summary_test! { + tokenize_strings, + r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#, + r#"N="+'+"+'#."#, + } + + summary_test! { + tokenize_recipe_interpolation_eol, + "foo: # some comment + {{hello}} +", + "N:#$>^{N}$<.", + } + + summary_test! { + tokenize_recipe_interpolation_eof, + "foo: # more comments + {{hello}} +# another comment +", + "N:#$>^{N}$<#$.", + } + + summary_test! { + tokenize_recipe_complex_interpolation_expression, + "foo: #lol\n {{a + b + \"z\" + blarg}}", + "N:#$>^{N+N+\"+N}<.", + } + + summary_test! { + tokenize_recipe_multiple_interpolations, + "foo:#ok\n {{a}}0{{b}}1{{c}}", + "N:#$>^{N}_{N}_{N}<.", + } + + summary_test! { + tokenize_junk, + "bob + +hello blah blah blah : a b c #whatever + ", + "N$$NNNN:NNN#$.", + } + + summary_test! { + tokenize_empty_lines, + " +# this does something +hello: + asdf + bsdf + + csdf + + dsdf # whatever + +# yolo + ", + "$#$N:$>^_$^_$$^_$$^_$$<#$.", + } + + summary_test! { + tokenize_comment_before_variable, + " +# +A='1' +echo: + echo {{A}} + ", + "$#$N='$N:$>^_{N}$<.", + } + + summary_test! { + tokenize_interpolation_backticks, + "hello:\n echo {{`echo hello` + `echo goodbye`}}", + "N:$>^_{`+`}<.", + } + + summary_test! { + tokenize_assignment_backticks, + "a = `echo hello` + `echo goodbye`", + "N=`+`.", + } + + summary_test! { + tokenize_multiple, + " +hello: + a + b + + c + + d + +# hello +bob: + frank + ", + + "$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.", + } + + summary_test! { + tokenize_comment, + "a:=#", + "N:=#." + } + + summary_test! { + tokenize_order, + r" +b: a + @mv a b + +a: + @touch F + @touch a + +d: c + @rm c + +c: b + @mv b c", + "$N:N$>^_$$^_$^_$$^_$$^_<.", + } + + error_test! { + name: tokenize_space_then_tab, + input: "a: + 0 + 1 +\t2 +", + index: 9, + line: 3, + column: 0, + width: None, + kind: InconsistentLeadingWhitespace{expected: " ", found: "\t"}, + } + + error_test! { + name: tokenize_tabs_then_tab_space, + input: "a: +\t\t0 +\t\t 1 +\t 2 +", + index: 12, + line: 3, + column: 0, + width: None, + kind: InconsistentLeadingWhitespace{expected: "\t\t", found: "\t "}, + } + + error_test! { + name: tokenize_outer_shebang, + input: "#!/usr/bin/env bash", + index: 0, + line: 0, + column: 0, + width: None, + kind: OuterShebang, + } + + error_test! { + name: tokenize_unknown, + input: "~", + index: 0, + line: 0, + column: 0, + width: None, + kind: UnknownStartOfToken, + } + + error_test! { + name: unterminated_string, + input: r#"a = ""#, + index: 3, + line: 0, + column: 3, + width: None, + kind: UnterminatedString, + } + + error_test! { + name: unterminated_string_with_escapes, + input: r#"a = "\n\t\r\"\\"#, + index: 3, + line: 0, + column: 3, + width: None, + kind: UnterminatedString, + } + + error_test! { + name: unterminated_raw_string, + input: "r a='asdf", + index: 4, + line: 0, + column: 4, + width: None, + kind: UnterminatedString, + } + + error_test! { + name: mixed_leading_whitespace, + input: "a:\n\t echo hello", + index: 3, + line: 1, + column: 0, + width: None, + kind: MixedLeadingWhitespace{whitespace: "\t "}, + } +} diff --git a/src/testing.rs b/src/testing.rs index d609a5b..376de70 100644 --- a/src/testing.rs +++ b/src/testing.rs @@ -1,9 +1,7 @@ use common::*; -use compile; - pub fn parse_success(text: &str) -> Justfile { - match compile(text) { + match Parser::parse(text) { Ok(justfile) => justfile, Err(error) => panic!("Expected successful parse but got error:\n{}", error), } @@ -32,7 +30,7 @@ macro_rules! compilation_error_test { kind: $kind, }; - let tokens = ::tokenizer::tokenize(input).unwrap(); + let tokens = ::Scanner::scan(input).unwrap(); let parser = ::Parser::new(input, tokens); if let Err(error) = parser.justfile() { diff --git a/src/tokenizer.rs b/src/tokenizer.rs deleted file mode 100644 index 0a91dcc..0000000 --- a/src/tokenizer.rs +++ /dev/null @@ -1,585 +0,0 @@ -use common::*; - -use TokenKind::*; -use CompilationErrorKind::*; - -fn re(pattern: &str) -> Regex { - Regex::new(pattern).unwrap() -} - -fn token(pattern: &str) -> Regex { - let mut s = String::new(); - s += r"^(?m)([ \t]*)("; - s += pattern; - s += ")"; - re(&s) -} - -fn mixed_whitespace(text: &str) -> bool { - !(text.chars().all(|c| c == ' ') || text.chars().all(|c| c == '\t')) -} - -pub fn tokenize(text: &str) -> CompilationResult> { - lazy_static! { - static ref BACKTICK: Regex = token(r"`[^`\n\r]*`" ); - static ref COLON: Regex = token(r":" ); - static ref AT: Regex = token(r"@" ); - static ref COMMENT: Regex = token(r"#([^!\n\r].*)?$" ); - static ref EOF: Regex = token(r"(?-m)$" ); - static ref EOL: Regex = token(r"\n|\r\n" ); - static ref EQUALS: Regex = token(r"=" ); - static ref INTERPOLATION_END: Regex = token(r"[}][}]" ); - static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]" ); - static ref NAME: Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)" ); - static ref PLUS: Regex = token(r"[+]" ); - static ref STRING: Regex = token("\"" ); - static ref RAW_STRING: Regex = token(r#"'[^']*'"# ); - static ref UNTERMINATED_RAW_STRING: Regex = token(r#"'[^']*"# ); - static ref INDENT: Regex = re(r"^([ \t]*)[^ \t\n\r]" ); - static ref INTERPOLATION_START: Regex = re(r"^[{][{]" ); - static ref LEADING_TEXT: Regex = re(r"^(?m)(.+?)[{][{]" ); - static ref LINE: Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$"); - static ref TEXT: Regex = re(r"^(?m)(.+)" ); - } - - #[derive(PartialEq)] - enum State<'a> { - Start, - Indent(&'a str), - Text, - Interpolation, - } - - fn indentation(text: &str) -> Option<&str> { - INDENT.captures(text).map(|captures| captures.get(1).unwrap().as_str()) - } - - let mut tokens = vec![]; - let mut rest = text; - let mut index = 0; - let mut line = 0; - let mut column = 0; - let mut state = vec![State::Start]; - - macro_rules! error { - ($kind:expr) => {{ - Err(CompilationError { - text: text, - index: index, - line: line, - column: column, - width: None, - kind: $kind, - }) - }}; - } - - loop { - if column == 0 { - if let Some(kind) = match (state.last().unwrap(), indentation(rest)) { - // ignore: was no indentation and there still isn't - // or current line is blank - (&State::Start, Some("")) | (_, None) => { - None - } - // indent: was no indentation, now there is - (&State::Start, Some(current)) => { - if mixed_whitespace(current) { - return error!(MixedLeadingWhitespace{whitespace: current}) - } - //indent = Some(current); - state.push(State::Indent(current)); - Some(Indent) - } - // dedent: there was indentation and now there isn't - (&State::Indent(_), Some("")) => { - // indent = None; - state.pop(); - Some(Dedent) - } - // was indentation and still is, check if the new indentation matches - (&State::Indent(previous), Some(current)) => { - if !current.starts_with(previous) { - return error!(InconsistentLeadingWhitespace{ - expected: previous, - found: current - }); - } - None - } - // at column 0 in some other state: this should never happen - (&State::Text, _) | (&State::Interpolation, _) => { - return error!(Internal { - message: "unexpected state at column 0".to_string() - }); - } - } { - tokens.push(Token { - index: index, - line: line, - column: column, - text: text, - prefix: "", - lexeme: "", - kind: kind, - }); - } - } - - // insert a dedent if we're indented and we hit the end of the file - if &State::Start != state.last().unwrap() && EOF.is_match(rest) { - tokens.push(Token { - index: index, - line: line, - column: column, - text: text, - prefix: "", - lexeme: "", - kind: Dedent, - }); - } - - let (prefix, lexeme, kind) = - if let (0, &State::Indent(indent), Some(captures)) = - (column, state.last().unwrap(), LINE.captures(rest)) { - let line = captures.get(0).unwrap().as_str(); - if !line.starts_with(indent) { - return error!(Internal{message: "unexpected indent".to_string()}); - } - state.push(State::Text); - (&line[0..indent.len()], "", Line) - } else if let Some(captures) = EOF.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eof) - } else if let State::Text = *state.last().unwrap() { - if let Some(captures) = INTERPOLATION_START.captures(rest) { - state.push(State::Interpolation); - ("", captures.get(0).unwrap().as_str(), InterpolationStart) - } else if let Some(captures) = LEADING_TEXT.captures(rest) { - ("", captures.get(1).unwrap().as_str(), Text) - } else if let Some(captures) = TEXT.captures(rest) { - ("", captures.get(1).unwrap().as_str(), Text) - } else if let Some(captures) = EOL.captures(rest) { - state.pop(); - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol) - } else { - return error!(Internal { - message: format!("Could not match token in text state: \"{}\"", rest) - }); - } - } else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationStart) - } else if let Some(captures) = INTERPOLATION_END.captures(rest) { - if state.last().unwrap() == &State::Interpolation { - state.pop(); - } - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationEnd) - } else if let Some(captures) = NAME.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Name) - } else if let Some(captures) = EOL.captures(rest) { - if state.last().unwrap() == &State::Interpolation { - return error!(Internal { - message: "hit EOL while still in interpolation state".to_string() - }); - } - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol) - } else if let Some(captures) = BACKTICK.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Backtick) - } else if let Some(captures) = COLON.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Colon) - } else if let Some(captures) = AT.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), At) - } else if let Some(captures) = PLUS.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Plus) - } else if let Some(captures) = EQUALS.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Equals) - } else if let Some(captures) = COMMENT.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Comment) - } else if let Some(captures) = RAW_STRING.captures(rest) { - (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), RawString) - } else if UNTERMINATED_RAW_STRING.is_match(rest) { - return error!(UnterminatedString); - } else if let Some(captures) = STRING.captures(rest) { - let prefix = captures.get(1).unwrap().as_str(); - let contents = &rest[prefix.len()+1..]; - if contents.is_empty() { - return error!(UnterminatedString); - } - let mut len = 0; - let mut escape = false; - for c in contents.chars() { - if c == '\n' || c == '\r' { - return error!(UnterminatedString); - } else if !escape && c == '"' { - break; - } else if !escape && c == '\\' { - escape = true; - } else if escape { - escape = false; - } - len += c.len_utf8(); - } - let start = prefix.len(); - let content_end = start + len + 1; - if escape || content_end >= rest.len() { - return error!(UnterminatedString); - } - (prefix, &rest[start..content_end + 1], StringToken) - } else if rest.starts_with("#!") { - return error!(OuterShebang) - } else { - return error!(UnknownStartOfToken) - }; - - tokens.push(Token { - index: index, - line: line, - column: column, - prefix: prefix, - text: text, - lexeme: lexeme, - kind: kind, - }); - - let len = prefix.len() + lexeme.len(); - - if len == 0 { - let last = tokens.last().unwrap(); - match last.kind { - Eof => {}, - _ => return Err(last.error(Internal { - message: format!("zero length token: {:?}", last) - })), - } - } - - match tokens.last().unwrap().kind { - Eol => { - line += 1; - column = 0; - } - Eof => { - break; - } - RawString => { - let lexeme_lines = lexeme.lines().count(); - line += lexeme_lines - 1; - if lexeme_lines == 1 { - column += len; - } else { - column = lexeme.lines().last().unwrap().len(); - } - } - _ => { - column += len; - } - } - - rest = &rest[len..]; - index += len; - } - - Ok(tokens) -} - -#[cfg(test)] -mod test { - use super::*; - - macro_rules! summary_test { - ($name:ident, $input:expr, $expected:expr $(,)*) => { - #[test] - fn $name() { - let input = $input; - let expected = $expected; - let tokens = tokenize(input).unwrap(); - let roundtrip = tokens.iter().map(|t| { - let mut s = String::new(); - s += t.prefix; - s += t.lexeme; - s - }).collect::>().join(""); - let actual = token_summary(&tokens); - if actual != expected { - panic!("token summary mismatch:\nexpected: {}\ngot: {}\n", expected, actual); - } - assert_eq!(input, roundtrip); - } - } - } - - fn token_summary(tokens: &[Token]) -> String { - tokens.iter().map(|t| { - match t.kind { - At => "@", - Backtick => "`", - Colon => ":", - Comment{..} => "#", - Dedent => "<", - Eof => ".", - Eol => "$", - Equals => "=", - Indent{..} => ">", - InterpolationEnd => "}", - InterpolationStart => "{", - Line{..} => "^", - Name => "N", - Plus => "+", - RawString => "'", - StringToken => "\"", - Text => "_", - } - }).collect::>().join("") - } - - macro_rules! error_test { - ( - name: $name:ident, - input: $input:expr, - index: $index:expr, - line: $line:expr, - column: $column:expr, - width: $width:expr, - kind: $kind:expr, - ) => { - #[test] - fn $name() { - let input = $input; - - let expected = CompilationError { - text: input, - index: $index, - line: $line, - column: $column, - width: $width, - kind: $kind, - }; - - if let Err(error) = tokenize(input) { - assert_eq!(error.text, expected.text); - assert_eq!(error.index, expected.index); - assert_eq!(error.line, expected.line); - assert_eq!(error.column, expected.column); - assert_eq!(error.kind, expected.kind); - assert_eq!(error, expected); - } else { - panic!("tokenize() succeeded but expected: {}\n{}", expected, input); - } - } - } - } - - summary_test! { - tokenize_strings, - r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#, - r#"N="+'+"+'#."#, - } - - summary_test! { - tokenize_recipe_interpolation_eol, - "foo: # some comment - {{hello}} -", - "N:#$>^{N}$<.", - } - - summary_test! { - tokenize_recipe_interpolation_eof, - "foo: # more comments - {{hello}} -# another comment -", - "N:#$>^{N}$<#$.", - } - - summary_test! { - tokenize_recipe_complex_interpolation_expression, - "foo: #lol\n {{a + b + \"z\" + blarg}}", - "N:#$>^{N+N+\"+N}<.", - } - - summary_test! { - tokenize_recipe_multiple_interpolations, - "foo:#ok\n {{a}}0{{b}}1{{c}}", - "N:#$>^{N}_{N}_{N}<.", - } - - summary_test! { - tokenize_junk, - "bob - -hello blah blah blah : a b c #whatever - ", - "N$$NNNN:NNN#$.", - } - - summary_test! { - tokenize_empty_lines, - " -# this does something -hello: - asdf - bsdf - - csdf - - dsdf # whatever - -# yolo - ", - "$#$N:$>^_$^_$$^_$$^_$$<#$.", - } - - summary_test! { - tokenize_comment_before_variable, - " -# -A='1' -echo: - echo {{A}} - ", - "$#$N='$N:$>^_{N}$<.", - } - - summary_test! { - tokenize_interpolation_backticks, - "hello:\n echo {{`echo hello` + `echo goodbye`}}", - "N:$>^_{`+`}<.", - } - - summary_test! { - tokenize_assignment_backticks, - "a = `echo hello` + `echo goodbye`", - "N=`+`.", - } - - summary_test! { - tokenize_multiple, - " -hello: - a - b - - c - - d - -# hello -bob: - frank - ", - - "$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.", - } - - summary_test! { - tokenize_comment, - "a:=#", - "N:=#." - } - - summary_test! { - tokenize_order, - r" -b: a - @mv a b - -a: - @touch F - @touch a - -d: c - @rm c - -c: b - @mv b c", - "$N:N$>^_$$^_$^_$$^_$$^_<.", - } - - error_test! { - name: tokenize_space_then_tab, - input: "a: - 0 - 1 -\t2 -", - index: 9, - line: 3, - column: 0, - width: None, - kind: InconsistentLeadingWhitespace{expected: " ", found: "\t"}, - } - - error_test! { - name: tokenize_tabs_then_tab_space, - input: "a: -\t\t0 -\t\t 1 -\t 2 -", - index: 12, - line: 3, - column: 0, - width: None, - kind: InconsistentLeadingWhitespace{expected: "\t\t", found: "\t "}, - } - - error_test! { - name: tokenize_outer_shebang, - input: "#!/usr/bin/env bash", - index: 0, - line: 0, - column: 0, - width: None, - kind: OuterShebang, - } - - error_test! { - name: tokenize_unknown, - input: "~", - index: 0, - line: 0, - column: 0, - width: None, - kind: UnknownStartOfToken, - } - - error_test! { - name: unterminated_string, - input: r#"a = ""#, - index: 3, - line: 0, - column: 3, - width: None, - kind: UnterminatedString, - } - - error_test! { - name: unterminated_string_with_escapes, - input: r#"a = "\n\t\r\"\\"#, - index: 3, - line: 0, - column: 3, - width: None, - kind: UnterminatedString, - } - - error_test! { - name: unterminated_raw_string, - input: "r a='asdf", - index: 4, - line: 0, - column: 4, - width: None, - kind: UnterminatedString, - } - - error_test! { - name: mixed_leading_whitespace, - input: "a:\n\t echo hello", - index: 3, - line: 1, - column: 0, - width: None, - kind: MixedLeadingWhitespace{whitespace: "\t "}, - } -}