From 861173581c22bf09b6853ab2f28b052b856991bc Mon Sep 17 00:00:00 2001
From: Casey Rodarmor <casey@rodarmor.com>
Date: Sat, 18 Nov 2017 03:36:02 -0800
Subject: [PATCH] Refactor Tokenizer (#260)

---
 src/main.rs      |  11 +-
 src/parser.rs    |   6 +
 src/run.rs       |   3 +-
 src/scanner.rs   | 600 +++++++++++++++++++++++++++++++++++++++++++++++
 src/testing.rs   |   6 +-
 src/tokenizer.rs | 585 ---------------------------------------------
 6 files changed, 611 insertions(+), 600 deletions(-)
 create mode 100644 src/scanner.rs
 delete mode 100644 src/tokenizer.rs
diff --git a/src/main.rs b/src/main.rs
index 7b3aa2d..1e12f1c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -33,11 +33,9 @@ mod recipe;
 mod recipe_resolver;
 mod run;
 mod runtime_error;
+mod scanner;
 mod shebang;
 mod token;
-mod tokenizer;
-
-use tokenizer::tokenize;
 
 mod common {
   pub use std::borrow::Cow;
@@ -70,18 +68,13 @@ mod common {
   pub use recipe::Recipe;
   pub use recipe_resolver::RecipeResolver;
   pub use runtime_error::{RuntimeError, RunResult};
+  pub use scanner::Scanner;
   pub use shebang::Shebang;
   pub use token::{Token, TokenKind};
 }
 
 use common::*;
 
-fn compile(text: &str) -> CompilationResult<Justfile> {
-  let tokens = tokenize(text)?;
-  let parser = Parser::new(text, tokens);
-  parser.justfile()
-}
-
 fn main() {
   run::run();
 }
diff --git a/src/parser.rs b/src/parser.rs
index dc94016..01d653d 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -14,6 +14,12 @@ pub struct Parser<'a> {
 }
 
 impl<'a> Parser<'a> {
+  /// Scan `text` into tokens and parse them into a `Justfile`.
+  /// A scanner error is returned before any parser is constructed.
+  pub fn parse(text: &'a str) -> CompilationResult<'a, Justfile> {
+    Parser::new(text, Scanner::scan(text)?).justfile()
+  }
+
   pub fn new(text: &'a str, tokens: Vec<Token<'a>>) -> Parser<'a> {
     Parser {
       text:              text,
diff --git a/src/run.rs b/src/run.rs
index 59af6ad..789919b 100644
--- a/src/run.rs
+++ b/src/run.rs
@@ -2,7 +2,6 @@ use common::*;
 
 use std::{convert, ffi};
 use clap::{App, Arg, ArgGroup, AppSettings};
-use compile;
 use misc::maybe_s;
 use configuration::DEFAULT_SHELL;
 
@@ -232,7 +231,7 @@ pub fn run() {
       .unwrap_or_else(|error| die!("Error reading justfile: {}", error));
   }
 
-  let justfile = compile(&text).unwrap_or_else(|error|
+  let justfile = Parser::parse(&text).unwrap_or_else(|error|
     if color.stderr().active() {
       die!("{:#}", error);
     } else {
diff --git a/src/scanner.rs b/src/scanner.rs
new file mode 100644
index 0000000..234c430
--- /dev/null
+++ b/src/scanner.rs
@@ -0,0 +1,600 @@
+use common::*;
+
+use TokenKind::*;
+use CompilationErrorKind::*;
+
+fn re(pattern: &str) -> Regex { // compile `pattern`; panics if it is not a valid regex
+  Regex::new(pattern).unwrap()
+}
+
+/// Build the regex used to recognize one kind of token at the start of the
+/// remaining input: capture group 1 is any leading spaces and tabs, capture
+/// group 2 is `pattern`. The `(?m)` flag turns on multi-line mode for what follows it.
+fn token(pattern: &str) -> Regex {
+  let anchored = format!(r"^(?m)([ \t]*)({})", pattern);
+  re(&anchored)
+}
+
+fn mixed_whitespace(text: &str) -> bool { // true unless `text` is all spaces or all tabs
+  text.chars().any(|c| c != ' ') && text.chars().any(|c| c != '\t')
+}
+
+pub struct Scanner<'a> {
+  tokens: Vec<Token<'a>>, // tokens emitted so far
+  text:   &'a str, // the complete input
+  rest:   &'a str, // the part of `text` that has not been consumed yet
+  index:  usize, // byte offset of `rest` within `text`
+  column: usize, // byte column on the current line; reset to 0 after each Eol token
+  line:   usize, // zero-based line number
+  state:  Vec<State<'a>>, // stack of scanner states; the last element is the current one
+}
+
+#[derive(PartialEq)]
+enum State<'a> {
+  Start, // initial state at the bottom of the stack: not inside an indented block
+  Indent(&'a str), // inside an indented block; holds the whitespace that opened it
+  Text, // inside an indented line, entered when its Line token is emitted and left at Eol
+  Interpolation, // entered at `{{` while in Text, left at `}}`
+}
+
+impl<'a> Scanner<'a> {
+  /// Tokenize `text` from the beginning. On success the returned vector
+  /// ends with an Eof token; otherwise the first error encountered is returned.
+  pub fn scan(text: &'a str) -> CompilationResult<Vec<Token<'a>>> {
+    Scanner {
+      state:  vec![State::Start],
+      tokens: Vec::new(),
+      text:   text,
+      rest:   text,
+      index:  0,
+      line:   0,
+      column: 0,
+    }.inner()
+  }
+
+  fn error(&self, kind: CompilationErrorKind<'a>) -> CompilationError<'a> { // error of `kind` located at the current scan position
+    CompilationError {
+      text:   self.text,
+      index:  self.index,
+      line:   self.line,
+      column: self.column,
+      width:  None, // errors built here never carry a width
+      kind:   kind,
+    }
+  }
+
+  fn token(&self, prefix: &'a str, lexeme: &'a str, kind: TokenKind) -> Token<'a> { // token located at the current scan position; the scanner itself is not advanced
+    Token {
+      index:  self.index,
+      line:   self.line,
+      column: self.column,
+      text:   self.text,
+      prefix: prefix,
+      lexeme: lexeme,
+      kind:   kind,
+    }
+  }
+
+  /// At column 0, compare the upcoming line's indentation with the current state; returns an Indent or Dedent token if it changed.
+  fn scan_indent(&mut self) -> CompilationResult<'a, Option<Token<'a>>> {
+    lazy_static! {
+      static ref INDENT: Regex = re(r"^([ \t]*)[^ \t\n\r]");
+    }
+
+    // indentation is only examined at the start of a line, so skip the regex match everywhere else
+    if self.column != 0 {
+      return Ok(None);
+    }
+
+    let indentation = INDENT.captures(self.rest).map(|captures| captures.get(1).unwrap().as_str());
+
+    let kind = match (self.state.last().unwrap(), indentation) {
+      // ignore: was no indentation and there still isn't
+      //         or current line is blank
+      (&State::Start, Some("")) | (_, None) => {
+        None
+      }
+      // indent: was no indentation, now there is
+      (&State::Start, Some(current)) => {
+        if mixed_whitespace(current) {
+          return Err(self.error(MixedLeadingWhitespace{whitespace: current}));
+        }
+        self.state.push(State::Indent(current));
+        Some(Indent)
+      }
+      // dedent: there was indentation and now there isn't
+      (&State::Indent(_), Some("")) => {
+        self.state.pop();
+        Some(Dedent)
+      }
+      // was indentation and still is, check if the new indentation matches
+      (&State::Indent(previous), Some(current)) => {
+        if !current.starts_with(previous) {
+          return Err(self.error(InconsistentLeadingWhitespace{
+            expected: previous,
+            found: current
+          }));
+        }
+        None
+      }
+      // at column 0 in some other state: this should never happen
+      (&State::Text, _) | (&State::Interpolation, _) => {
+        return Err(self.error(Internal {
+          message: "unexpected state at column 0".to_string()
+        }));
+      }
+    };
+    Ok(kind.map(|kind| self.token("", "", kind)))
+  }
+
+  pub fn inner(mut self) -> CompilationResult<'a, Vec<Token<'a>>> { // main scan loop: consumes the scanner and returns its tokens
+    lazy_static! {
+      static ref BACKTICK:                  Regex = token(r"`[^`\n\r]*`"               );
+      static ref COLON:                     Regex = token(r":"                         );
+      static ref AT:                        Regex = token(r"@"                         );
+      static ref COMMENT:                   Regex = token(r"#([^!\n\r].*)?$"           );
+      static ref EOF:                       Regex = token(r"(?-m)$"                    );
+      static ref EOL:                       Regex = token(r"\n|\r\n"                   );
+      static ref EQUALS:                    Regex = token(r"="                         );
+      static ref INTERPOLATION_END:         Regex = token(r"[}][}]"                    );
+      static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]"                    );
+      static ref NAME:                      Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)" );
+      static ref PLUS:                      Regex = token(r"[+]"                       );
+      static ref STRING:                    Regex = token("\""                         );
+      static ref RAW_STRING:                Regex = token(r#"'[^']*'"#                 );
+      static ref UNTERMINATED_RAW_STRING:   Regex = token(r#"'[^']*"#                  );
+      static ref INTERPOLATION_START:       Regex = re(r"^[{][{]"                 );
+      static ref LEADING_TEXT:              Regex = re(r"^(?m)(.+?)[{][{]"        );
+      static ref LINE:                      Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$");
+      static ref TEXT:                      Regex = re(r"^(?m)(.+)"               );
+    }
+
+    loop {
+      if let Some(token) = self.scan_indent()? {
+        self.tokens.push(token);
+      }
+
+      // insert a dedent if we're indented and we hit the end of the file
+      if &State::Start != self.state.last().unwrap() && EOF.is_match(self.rest) {
+        let token = self.token("", "", Dedent);
+        self.tokens.push(token);
+      }
+
+      let (prefix, lexeme, kind) =
+      if let (0, &State::Indent(indent), Some(captures)) =
+        (self.column, self.state.last().unwrap(), LINE.captures(self.rest)) { // start of an indented, non-blank line while in the Indent state
+        let line = captures.get(0).unwrap().as_str();
+        if !line.starts_with(indent) {
+          return Err(self.error(Internal{message: "unexpected indent".to_string()}));
+        }
+        self.state.push(State::Text);
+        (&line[0..indent.len()], "", Line) // the Line token consumes only the indentation, as its prefix
+      } else if let Some(captures) = EOF.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eof)
+      } else if let State::Text = *self.state.last().unwrap() {
+        if let Some(captures) = INTERPOLATION_START.captures(self.rest) {
+          self.state.push(State::Interpolation);
+          ("", captures.get(0).unwrap().as_str(), InterpolationStart)
+        } else if let Some(captures) = LEADING_TEXT.captures(self.rest) {
+          ("", captures.get(1).unwrap().as_str(), Text)
+        } else if let Some(captures) = TEXT.captures(self.rest) {
+          ("", captures.get(1).unwrap().as_str(), Text)
+        } else if let Some(captures) = EOL.captures(self.rest) {
+          self.state.pop();
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
+        } else {
+          return Err(self.error(Internal {
+            message: format!("Could not match token in text state: \"{}\"", self.rest)
+          }));
+        }
+      } else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationStart)
+      } else if let Some(captures) = INTERPOLATION_END.captures(self.rest) {
+        if self.state.last().unwrap() == &State::Interpolation {
+          self.state.pop();
+        }
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationEnd)
+      } else if let Some(captures) = NAME.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Name)
+      } else if let Some(captures) = EOL.captures(self.rest) {
+        if self.state.last().unwrap() == &State::Interpolation {
+          return Err(self.error(Internal {
+            message: "hit EOL while still in interpolation state".to_string()
+          }));
+        }
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
+      } else if let Some(captures) = BACKTICK.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Backtick)
+      } else if let Some(captures) = COLON.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Colon)
+      } else if let Some(captures) = AT.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), At)
+      } else if let Some(captures) = PLUS.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Plus)
+      } else if let Some(captures) = EQUALS.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Equals)
+      } else if let Some(captures) = COMMENT.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Comment)
+      } else if let Some(captures) = RAW_STRING.captures(self.rest) {
+        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), RawString)
+      } else if UNTERMINATED_RAW_STRING.is_match(self.rest) {
+        return Err(self.error(UnterminatedString));
+      } else if let Some(captures) = STRING.captures(self.rest) {
+        let prefix = captures.get(1).unwrap().as_str();
+        let contents = &self.rest[prefix.len()+1..]; // everything after the opening quote
+        if contents.is_empty() {
+          return Err(self.error(UnterminatedString));
+        }
+        let mut len = 0;
+        let mut escape = false;
+        for c in contents.chars() {
+          if c == '\n' || c == '\r' {
+            return Err(self.error(UnterminatedString));
+          } else if !escape && c == '"' {
+            break;
+          } else if !escape && c == '\\' {
+            escape = true;
+          } else if escape {
+            escape = false;
+          }
+          len += c.len_utf8(); // byte length of the contents scanned so far
+        }
+        let start = prefix.len();
+        let content_end = start + len + 1; // index in `rest` where the closing quote must be
+        if escape || content_end >= self.rest.len() {
+          return Err(self.error(UnterminatedString));
+        }
+        (prefix, &self.rest[start..content_end + 1], StringToken)
+      } else if self.rest.starts_with("#!") {
+        return Err(self.error(OuterShebang));
+      } else {
+        return Err(self.error(UnknownStartOfToken));
+      };
+
+      let token = self.token(prefix, lexeme, kind);
+      self.tokens.push(token);
+
+      let len = prefix.len() + lexeme.len();
+
+      if len == 0 { // only Eof may be zero-length: any other token would leave `rest` unchanged
+        let last = self.tokens.last().unwrap();
+        match last.kind {
+          Eof => {},
+          _ => return Err(last.error(Internal {
+            message: format!("zero length token: {:?}", last)
+          })),
+        }
+      }
+
+      match self.tokens.last().unwrap().kind {
+        Eol => {
+          self.line += 1;
+          self.column = 0;
+        }
+        Eof => {
+          break;
+        }
+        RawString => {
+          let lexeme_lines = lexeme.lines().count();
+          self.line += lexeme_lines - 1; // a raw string may span several lines
+          if lexeme_lines == 1 {
+            self.column += len;
+          } else {
+            self.column = lexeme.lines().last().unwrap().len();
+          }
+        }
+        _ => {
+          self.column += len;
+        }
+      }
+
+      self.rest = &self.rest[len..];
+      self.index += len;
+    }
+
+    Ok(self.tokens)
+  }
+}
+
+#[cfg(test)]
+mod test {
+  use super::*;
+
+  macro_rules! summary_test { // defines a #[test] that scans $input and compares its token summary with $expected
+    ($name:ident, $input:expr, $expected:expr $(,)*) => {
+      #[test]
+      fn $name() {
+        let input = $input;
+        let expected = $expected;
+        let tokens = ::Scanner::scan(input).unwrap();
+        let roundtrip = tokens.iter().map(|t| { // prefix + lexeme of every token, in order
+          let mut s = String::new();
+          s += t.prefix;
+          s += t.lexeme;
+          s
+        }).collect::<Vec<_>>().join("");
+        let actual = token_summary(&tokens);
+        if actual != expected {
+          panic!("token summary mismatch:\nexpected: {}\ngot:      {}\n", expected, actual);
+        }
+        assert_eq!(input, roundtrip); // the tokens must reproduce the input exactly
+      }
+    }
+  }
+
+  /// Render `tokens` as a compact string with one character per token, using
+  /// the mapping below (for example `N` for Name, `$` for Eol and `.` for Eof).
+  fn token_summary(tokens: &[Token]) -> String {
+    tokens.iter().map(|t| match t.kind {
+      At                 => "@",
+      Backtick           => "`",
+      Colon              => ":",
+      Comment{..}        => "#",
+      Dedent             => "<",
+      Eof                => ".",
+      Eol                => "$",
+      Equals             => "=",
+      Indent{..}         => ">",
+      InterpolationEnd   => "}",
+      InterpolationStart => "{",
+      Line{..}           => "^",
+      Name               => "N",
+      Plus               => "+",
+      RawString          => "'",
+      StringToken        => "\"",
+      Text               => "_",
+    }).collect()
+  }
+
+  macro_rules! error_test { // defines a #[test] asserting that scanning $input fails with exactly the described error
+    (
+      name:     $name:ident,
+      input:    $input:expr,
+      index:    $index:expr,
+      line:     $line:expr,
+      column:   $column:expr,
+      width:    $width:expr,
+      kind:     $kind:expr,
+    ) => {
+      #[test]
+      fn $name() {
+        let input = $input;
+
+        let expected = CompilationError {
+          text:   input,
+          index:  $index,
+          line:   $line,
+          column: $column,
+          width:  $width,
+          kind:   $kind,
+        };
+
+        if let Err(error) = Scanner::scan(input) {
+          assert_eq!(error.text,   expected.text); // fields are compared one by one before the whole-error comparison
+          assert_eq!(error.index,  expected.index);
+          assert_eq!(error.line,   expected.line);
+          assert_eq!(error.column, expected.column);
+          assert_eq!(error.kind,   expected.kind);
+          assert_eq!(error,        expected);
+        } else {
+          panic!("tokenize succeeded but expected: {}\n{}", expected, input);
+        }
+      }
+    }
+  }
+
+  summary_test! {
+    tokenize_strings,
+    r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#,
+    r#"N="+'+"+'#."#,
+  }
+
+  summary_test! {
+    tokenize_recipe_interpolation_eol,
+    "foo: # some comment
+ {{hello}}
+",
+    "N:#$>^{N}$<.",
+  }
+
+  summary_test! {
+    tokenize_recipe_interpolation_eof,
+    "foo: # more comments
+ {{hello}}
+# another comment
+",
+    "N:#$>^{N}$<#$.",
+  }
+
+  summary_test! {
+    tokenize_recipe_complex_interpolation_expression,
+    "foo: #lol\n {{a + b + \"z\" + blarg}}",
+    "N:#$>^{N+N+\"+N}<.",
+  }
+
+  summary_test! {
+    tokenize_recipe_multiple_interpolations,
+    "foo:#ok\n {{a}}0{{b}}1{{c}}",
+    "N:#$>^{N}_{N}_{N}<.",
+  }
+
+  summary_test! {
+    tokenize_junk,
+    "bob
+
+hello blah blah blah : a b c #whatever
+    ",
+    "N$$NNNN:NNN#$.",
+  }
+
+  summary_test! {
+    tokenize_empty_lines,
+    "
+# this does something
+hello:
+  asdf
+  bsdf
+
+  csdf
+
+  dsdf # whatever
+
+# yolo
+  ",
+    "$#$N:$>^_$^_$$^_$$^_$$<#$.",
+  }
+
+  summary_test! {
+    tokenize_comment_before_variable,
+    "
+#
+A='1'
+echo:
+  echo {{A}}
+  ",
+    "$#$N='$N:$>^_{N}$<.",
+  }
+
+  summary_test! {
+    tokenize_interpolation_backticks,
+    "hello:\n echo {{`echo hello` + `echo goodbye`}}",
+    "N:$>^_{`+`}<.",
+  }
+
+  summary_test! {
+    tokenize_assignment_backticks,
+    "a = `echo hello` + `echo goodbye`",
+    "N=`+`.",
+  }
+
+  summary_test! {
+    tokenize_multiple,
+    "
+hello:
+  a
+  b
+
+  c
+
+  d
+
+# hello
+bob:
+  frank
+  ",
+
+    "$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.",
+  }
+
+  summary_test! {
+    tokenize_comment,
+    "a:=#",
+    "N:=#."
+  }
+
+  summary_test! {
+    tokenize_order,
+    r"
+b: a
+  @mv a b
+
+a:
+  @touch F
+  @touch a
+
+d: c
+  @rm c
+
+c: b
+  @mv b c",
+    "$N:N$>^_$$<N:$>^_$^_$$<N:N$>^_$$<N:N$>^_<.",
+  }
+
+  error_test! {
+    name:  tokenize_space_then_tab,
+    input: "a:
+ 0
+ 1
+\t2
+",
+    index:  9,
+    line:   3,
+    column: 0,
+    width:  None,
+    kind:   InconsistentLeadingWhitespace{expected: " ", found: "\t"},
+  }
+
+  error_test! {
+    name:  tokenize_tabs_then_tab_space,
+    input: "a:
+\t\t0
+\t\t 1
+\t  2
+",
+    index:  12,
+    line:   3,
+    column: 0,
+    width:  None,
+    kind:   InconsistentLeadingWhitespace{expected: "\t\t", found: "\t  "},
+  }
+
+  error_test! {
+    name: tokenize_outer_shebang,
+    input: "#!/usr/bin/env bash",
+    index:  0,
+    line:   0,
+    column: 0,
+    width:  None,
+    kind:   OuterShebang,
+  }
+
+  error_test! {
+    name: tokenize_unknown,
+    input: "~",
+    index:  0,
+    line:   0,
+    column: 0,
+    width:  None,
+    kind:   UnknownStartOfToken,
+  }
+
+  error_test! {
+    name: unterminated_string,
+    input: r#"a = ""#,
+    index:  3,
+    line:   0,
+    column: 3,
+    width:  None,
+    kind:   UnterminatedString,
+  }
+
+  error_test! {
+    name: unterminated_string_with_escapes,
+    input: r#"a = "\n\t\r\"\\"#,
+    index:  3,
+    line:   0,
+    column: 3,
+    width:  None,
+    kind:   UnterminatedString,
+  }
+
+  error_test! {
+    name:  unterminated_raw_string,
+    input: "r a='asdf",
+    index:  4,
+    line:   0,
+    column: 4,
+    width:  None,
+    kind:   UnterminatedString,
+  }
+
+  error_test! {
+    name: mixed_leading_whitespace,
+    input: "a:\n\t echo hello",
+    index:  3,
+    line:   1,
+    column: 0,
+    width:  None,
+    kind:   MixedLeadingWhitespace{whitespace: "\t "},
+  }
+}
diff --git a/src/testing.rs b/src/testing.rs
index d609a5b..376de70 100644
--- a/src/testing.rs
+++ b/src/testing.rs
@@ -1,9 +1,7 @@
 use common::*;
 
-use compile;
-
 pub fn parse_success(text: &str) -> Justfile {
-  match compile(text) {
+  match Parser::parse(text) {
     Ok(justfile) => justfile,
     Err(error) => panic!("Expected successful parse but got error:\n{}", error),
   }
@@ -32,7 +30,7 @@ macro_rules! compilation_error_test {
         kind:   $kind,
       };
 
-      let tokens = ::tokenizer::tokenize(input).unwrap();
+      let tokens = ::Scanner::scan(input).unwrap();
       let parser = ::Parser::new(input, tokens);
 
       if let Err(error) = parser.justfile() {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
deleted file mode 100644
index 0a91dcc..0000000
--- a/src/tokenizer.rs
+++ /dev/null
@@ -1,585 +0,0 @@
-use common::*;
-
-use TokenKind::*;
-use CompilationErrorKind::*;
-
-fn re(pattern: &str) -> Regex {
-  Regex::new(pattern).unwrap()
-}
-
-fn token(pattern: &str) -> Regex {
-  let mut s = String::new();
-  s += r"^(?m)([ \t]*)(";
-  s += pattern;
-  s += ")";
-  re(&s)
-}
-
-fn mixed_whitespace(text: &str) -> bool {
-  !(text.chars().all(|c| c == ' ') || text.chars().all(|c| c == '\t'))
-}
-
-pub fn tokenize(text: &str) -> CompilationResult<Vec<Token>> {
-  lazy_static! {
-    static ref BACKTICK:                  Regex = token(r"`[^`\n\r]*`"               );
-    static ref COLON:                     Regex = token(r":"                         );
-    static ref AT:                        Regex = token(r"@"                         );
-    static ref COMMENT:                   Regex = token(r"#([^!\n\r].*)?$"           );
-    static ref EOF:                       Regex = token(r"(?-m)$"                    );
-    static ref EOL:                       Regex = token(r"\n|\r\n"                   );
-    static ref EQUALS:                    Regex = token(r"="                         );
-    static ref INTERPOLATION_END:         Regex = token(r"[}][}]"                    );
-    static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]"                    );
-    static ref NAME:                      Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)" );
-    static ref PLUS:                      Regex = token(r"[+]"                       );
-    static ref STRING:                    Regex = token("\""                         );
-    static ref RAW_STRING:                Regex = token(r#"'[^']*'"#                 );
-    static ref UNTERMINATED_RAW_STRING:   Regex = token(r#"'[^']*"#                  );
-    static ref INDENT:                    Regex = re(r"^([ \t]*)[^ \t\n\r]"     );
-    static ref INTERPOLATION_START:       Regex = re(r"^[{][{]"                 );
-    static ref LEADING_TEXT:              Regex = re(r"^(?m)(.+?)[{][{]"        );
-    static ref LINE:                      Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$");
-    static ref TEXT:                      Regex = re(r"^(?m)(.+)"               );
-  }
-
-  #[derive(PartialEq)]
-  enum State<'a> {
-    Start,
-    Indent(&'a str),
-    Text,
-    Interpolation,
-  }
-
-  fn indentation(text: &str) -> Option<&str> {
-    INDENT.captures(text).map(|captures| captures.get(1).unwrap().as_str())
-  }
-
-  let mut tokens = vec![];
-  let mut rest   = text;
-  let mut index  = 0;
-  let mut line   = 0;
-  let mut column = 0;
-  let mut state  = vec![State::Start];
-
-  macro_rules! error {
-    ($kind:expr) => {{
-      Err(CompilationError {
-        text:   text,
-        index:  index,
-        line:   line,
-        column: column,
-        width:  None,
-        kind:   $kind,
-      })
-    }};
-  }
-
-  loop {
-    if column == 0 {
-      if let Some(kind) = match (state.last().unwrap(), indentation(rest)) {
-        // ignore: was no indentation and there still isn't
-        //         or current line is blank
-        (&State::Start, Some("")) | (_, None) => {
-          None
-        }
-        // indent: was no indentation, now there is
-        (&State::Start, Some(current)) => {
-          if mixed_whitespace(current) {
-            return error!(MixedLeadingWhitespace{whitespace: current})
-          }
-          //indent = Some(current);
-          state.push(State::Indent(current));
-          Some(Indent)
-        }
-        // dedent: there was indentation and now there isn't
-        (&State::Indent(_), Some("")) => {
-          // indent = None;
-          state.pop();
-          Some(Dedent)
-        }
-        // was indentation and still is, check if the new indentation matches
-        (&State::Indent(previous), Some(current)) => {
-          if !current.starts_with(previous) {
-            return error!(InconsistentLeadingWhitespace{
-              expected: previous,
-              found: current
-            });
-          }
-          None
-        }
-        // at column 0 in some other state: this should never happen
-        (&State::Text, _) | (&State::Interpolation, _) => {
-          return error!(Internal {
-            message: "unexpected state at column 0".to_string()
-          });
-        }
-      } {
-        tokens.push(Token {
-          index:  index,
-          line:   line,
-          column: column,
-          text:   text,
-          prefix: "",
-          lexeme: "",
-          kind:   kind,
-        });
-      }
-    }
-
-    // insert a dedent if we're indented and we hit the end of the file
-    if &State::Start != state.last().unwrap() && EOF.is_match(rest) {
-      tokens.push(Token {
-        index:  index,
-        line:   line,
-        column: column,
-        text:   text,
-        prefix: "",
-        lexeme: "",
-        kind:   Dedent,
-      });
-    }
-
-    let (prefix, lexeme, kind) =
-    if let (0, &State::Indent(indent), Some(captures)) =
-      (column, state.last().unwrap(), LINE.captures(rest)) {
-      let line = captures.get(0).unwrap().as_str();
-      if !line.starts_with(indent) {
-        return error!(Internal{message: "unexpected indent".to_string()});
-      }
-      state.push(State::Text);
-      (&line[0..indent.len()], "", Line)
-    } else if let Some(captures) = EOF.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eof)
-    } else if let State::Text = *state.last().unwrap() {
-      if let Some(captures) = INTERPOLATION_START.captures(rest) {
-        state.push(State::Interpolation);
-        ("", captures.get(0).unwrap().as_str(), InterpolationStart)
-      } else if let Some(captures) = LEADING_TEXT.captures(rest) {
-        ("", captures.get(1).unwrap().as_str(), Text)
-      } else if let Some(captures) = TEXT.captures(rest) {
-        ("", captures.get(1).unwrap().as_str(), Text)
-      } else if let Some(captures) = EOL.captures(rest) {
-        state.pop();
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
-      } else {
-        return error!(Internal {
-          message: format!("Could not match token in text state: \"{}\"", rest)
-        });
-      }
-    } else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationStart)
-    } else if let Some(captures) = INTERPOLATION_END.captures(rest) {
-      if state.last().unwrap() == &State::Interpolation {
-        state.pop();
-      }
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationEnd)
-    } else if let Some(captures) = NAME.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Name)
-    } else if let Some(captures) = EOL.captures(rest) {
-      if state.last().unwrap() == &State::Interpolation {
-        return error!(Internal {
-          message: "hit EOL while still in interpolation state".to_string()
-        });
-      }
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
-    } else if let Some(captures) = BACKTICK.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Backtick)
-    } else if let Some(captures) = COLON.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Colon)
-    } else if let Some(captures) = AT.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), At)
-    } else if let Some(captures) = PLUS.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Plus)
-    } else if let Some(captures) = EQUALS.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Equals)
-    } else if let Some(captures) = COMMENT.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Comment)
-    } else if let Some(captures) = RAW_STRING.captures(rest) {
-      (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), RawString)
-    } else if UNTERMINATED_RAW_STRING.is_match(rest) {
-      return error!(UnterminatedString);
-    } else if let Some(captures) = STRING.captures(rest) {
-      let prefix = captures.get(1).unwrap().as_str();
-      let contents = &rest[prefix.len()+1..];
-      if contents.is_empty() {
-        return error!(UnterminatedString);
-      }
-      let mut len = 0;
-      let mut escape = false;
-      for c in contents.chars() {
-        if c == '\n' || c == '\r' {
-          return error!(UnterminatedString);
-        } else if !escape && c == '"' {
-          break;
-        } else if !escape && c == '\\' {
-          escape = true;
-        } else if escape {
-          escape = false;
-        }
-        len += c.len_utf8();
-      }
-      let start = prefix.len();
-      let content_end = start + len + 1;
-      if escape || content_end >= rest.len() {
-        return error!(UnterminatedString);
-      }
-      (prefix, &rest[start..content_end + 1], StringToken)
-    } else if rest.starts_with("#!") {
-      return error!(OuterShebang)
-    } else {
-      return error!(UnknownStartOfToken)
-    };
-
-    tokens.push(Token {
-      index:  index,
-      line:   line,
-      column: column,
-      prefix: prefix,
-      text:   text,
-      lexeme: lexeme,
-      kind:   kind,
-    });
-
-    let len = prefix.len() + lexeme.len();
-
-    if len == 0 {
-      let last = tokens.last().unwrap();
-      match last.kind {
-        Eof => {},
-        _ => return Err(last.error(Internal {
-          message: format!("zero length token: {:?}", last)
-        })),
-      }
-    }
-
-    match tokens.last().unwrap().kind {
-      Eol => {
-        line += 1;
-        column = 0;
-      }
-      Eof => {
-        break;
-      }
-      RawString => {
-        let lexeme_lines = lexeme.lines().count();
-        line += lexeme_lines - 1;
-        if lexeme_lines == 1 {
-          column += len;
-        } else {
-          column = lexeme.lines().last().unwrap().len();
-        }
-      }
-      _ => {
-        column += len;
-      }
-    }
-
-    rest = &rest[len..];
-    index += len;
-  }
-
-  Ok(tokens)
-}
-
-#[cfg(test)]
-mod test {
-  use super::*;
-
-  macro_rules! summary_test {
-    ($name:ident, $input:expr, $expected:expr $(,)*) => {
-      #[test]
-      fn $name() {
-        let input = $input;
-        let expected = $expected;
-        let tokens = tokenize(input).unwrap();
-        let roundtrip = tokens.iter().map(|t| {
-          let mut s = String::new();
-          s += t.prefix;
-          s += t.lexeme;
-          s
-        }).collect::<Vec<_>>().join("");
-        let actual = token_summary(&tokens);
-        if actual != expected {
-          panic!("token summary mismatch:\nexpected: {}\ngot:      {}\n", expected, actual);
-        }
-        assert_eq!(input, roundtrip);
-      }
-    }
-  }
-
-  fn token_summary(tokens: &[Token]) -> String {
-    tokens.iter().map(|t| {
-      match t.kind {
-        At                 => "@",
-        Backtick           => "`",
-        Colon              => ":",
-        Comment{..}        => "#",
-        Dedent             => "<",
-        Eof                => ".",
-        Eol                => "$",
-        Equals             => "=",
-        Indent{..}         => ">",
-        InterpolationEnd   => "}",
-        InterpolationStart => "{",
-        Line{..}           => "^",
-        Name               => "N",
-        Plus               => "+",
-        RawString          => "'",
-        StringToken        => "\"",
-        Text               => "_",
-      }
-    }).collect::<Vec<_>>().join("")
-  }
-
-  macro_rules! error_test {
-    (
-      name:     $name:ident,
-      input:    $input:expr,
-      index:    $index:expr,
-      line:     $line:expr,
-      column:   $column:expr,
-      width:    $width:expr,
-      kind:     $kind:expr,
-    ) => {
-      #[test]
-      fn $name() {
-        let input = $input;
-
-        let expected = CompilationError {
-          text:   input,
-          index:  $index,
-          line:   $line,
-          column: $column,
-          width:  $width,
-          kind:   $kind,
-        };
-
-        if let Err(error) = tokenize(input) {
-          assert_eq!(error.text,   expected.text);
-          assert_eq!(error.index,  expected.index);
-          assert_eq!(error.line,   expected.line);
-          assert_eq!(error.column, expected.column);
-          assert_eq!(error.kind,   expected.kind);
-          assert_eq!(error,        expected);
-        } else {
-          panic!("tokenize() succeeded but expected: {}\n{}", expected, input);
-        }
-      }
-    }
-  }
-
-  summary_test! {
-    tokenize_strings,
-    r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#,
-    r#"N="+'+"+'#."#,
-  }
-
-  summary_test! {
-    tokenize_recipe_interpolation_eol,
-    "foo: # some comment
- {{hello}}
-", 
-    "N:#$>^{N}$<.",
-  }
-
-  summary_test! {
-    tokenize_recipe_interpolation_eof,
-    "foo: # more comments
- {{hello}}
-# another comment
-",
-    "N:#$>^{N}$<#$.",
-  }
-
-  summary_test! {
-    tokenize_recipe_complex_interpolation_expression,
-    "foo: #lol\n {{a + b + \"z\" + blarg}}",
-    "N:#$>^{N+N+\"+N}<.",
-  }
-
-  summary_test! {
-    tokenize_recipe_multiple_interpolations,
-    "foo:#ok\n {{a}}0{{b}}1{{c}}",
-    "N:#$>^{N}_{N}_{N}<.",
-  }
-
-  summary_test! {
-    tokenize_junk,
-    "bob
-
-hello blah blah blah : a b c #whatever
-    ",
-    "N$$NNNN:NNN#$.",
-  }
-
-  summary_test! {
-    tokenize_empty_lines,
-    "
-# this does something
-hello:
-  asdf
-  bsdf
-
-  csdf
-
-  dsdf # whatever
-
-# yolo
-  ",
-    "$#$N:$>^_$^_$$^_$$^_$$<#$.",
-  }
-
-  summary_test! {
-    tokenize_comment_before_variable,
-    "
-#
-A='1'
-echo:
-  echo {{A}}
-  ",
-    "$#$N='$N:$>^_{N}$<.",
-  }
-
-  summary_test! {
-    tokenize_interpolation_backticks,
-    "hello:\n echo {{`echo hello` + `echo goodbye`}}",
-    "N:$>^_{`+`}<.",
-  }
-
-  summary_test! {
-    tokenize_assignment_backticks,
-    "a = `echo hello` + `echo goodbye`",
-    "N=`+`.",
-  }
-
-  summary_test! {
-    tokenize_multiple,
-    "
-hello:
-  a
-  b
-
-  c
-
-  d
-
-# hello
-bob:
-  frank
-  ",
-
-    "$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.",
-  }
-
-  summary_test! {
-    tokenize_comment,
-    "a:=#",
-    "N:=#."
-  }
-
-  summary_test! {
-    tokenize_order,
-    r"
-b: a
-  @mv a b
-
-a:
-  @touch F
-  @touch a
-
-d: c
-  @rm c
-
-c: b
-  @mv b c",
-    "$N:N$>^_$$<N:$>^_$^_$$<N:N$>^_$$<N:N$>^_<.",
-  }
-
-  error_test! {
-    name:  tokenize_space_then_tab,
-    input: "a:
- 0
- 1
-\t2
-",
-    index:  9,
-    line:   3,
-    column: 0,
-    width:  None,
-    kind:   InconsistentLeadingWhitespace{expected: " ", found: "\t"},
-  }
-
-  error_test! {
-    name:  tokenize_tabs_then_tab_space,
-    input: "a:
-\t\t0
-\t\t 1
-\t  2
-",
-    index:  12,
-    line:   3,
-    column: 0,
-    width:  None,
-    kind:   InconsistentLeadingWhitespace{expected: "\t\t", found: "\t  "},
-  }
-
-  error_test! {
-    name: tokenize_outer_shebang,
-    input: "#!/usr/bin/env bash",
-    index:  0,
-    line:   0,
-    column: 0,
-    width:  None,
-    kind:   OuterShebang,
-  }
-
-  error_test! {
-    name: tokenize_unknown,
-    input: "~",
-    index:  0,
-    line:   0,
-    column: 0,
-    width:  None,
-    kind:   UnknownStartOfToken,
-  }
-
-  error_test! {
-    name: unterminated_string,
-    input: r#"a = ""#,
-    index:  3,
-    line:   0,
-    column: 3,
-    width:  None,
-    kind:   UnterminatedString,
-  }
-
-  error_test! {
-    name: unterminated_string_with_escapes,
-    input: r#"a = "\n\t\r\"\\"#,
-    index:  3,
-    line:   0,
-    column: 3,
-    width:  None,
-    kind:   UnterminatedString,
-  }
-
-  error_test! {
-    name:  unterminated_raw_string,
-    input: "r a='asdf",
-    index:  4,
-    line:   0,
-    column: 4,
-    width:  None,
-    kind:   UnterminatedString,
-  }
-
-  error_test! {
-    name: mixed_leading_whitespace,
-    input: "a:\n\t echo hello",
-    index:  3,
-    line:   1,
-    column: 0,
-    width:  None,
-    kind:   MixedLeadingWhitespace{whitespace: "\t "},
-  }
-}