just/src/lexer.rs

709 lines
17 KiB
Rust
Raw Normal View History

2017-11-18 03:36:02 -08:00
use common::*;
use CompilationErrorKind::*;
use TokenKind::*;
2017-11-18 03:36:02 -08:00
/// Compiles `pattern` into a `Regex`.
///
/// # Panics
///
/// Panics if `pattern` is not a valid regular expression.
fn re(pattern: &str) -> Regex {
    let compiled = Regex::new(pattern);
    compiled.unwrap()
}
/// Builds the regex used to recognize one kind of token.
///
/// The resulting expression has two capture groups that callers rely on:
/// group 1 is the run of spaces/tabs preceding the token (its prefix) and
/// group 2 is the text matched by `pattern` (its lexeme).
///
/// NOTE(review): the leading `^` is written before `(?m)`, so it presumably
/// stays anchored to the start of the text while any `$` inside `pattern`
/// is multi-line -- confirm against the regex crate's flag semantics.
fn token(pattern: &str) -> Regex {
    re(&format!(r"^(?m)([ \t]*)({})", pattern))
}
/// Returns `true` when `text` is neither made up solely of spaces nor solely
/// of tabs. The empty string counts as unmixed.
fn mixed_whitespace(text: &str) -> bool {
    let has_non_space = text.chars().any(|c| c != ' ');
    let has_non_tab = text.chars().any(|c| c != '\t');
    has_non_space && has_non_tab
}
2017-12-01 02:22:32 -08:00
pub struct Lexer<'a> {
2017-11-18 03:36:02 -08:00
tokens: Vec<Token<'a>>,
text: &'a str,
rest: &'a str,
index: usize,
2017-11-18 03:36:02 -08:00
column: usize,
line: usize,
state: Vec<State<'a>>,
2017-11-18 03:36:02 -08:00
}
/// The lexer's position within the grammar; kept on a stack in `Lexer`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State<'a> {
    /// Outside any indented block.
    Start,
    /// Inside an indented block; carries the whitespace that opened it.
    Indent(&'a str),
    /// Inside the text of an indented line.
    Text,
    /// Inside a `{{ ... }}` interpolation that was opened from `Text`.
    Interpolation,
}
2017-12-01 02:22:32 -08:00
impl<'a> Lexer<'a> {
/// Tokenizes `text`, returning every token up to and including the final
/// `Eof` token.
///
/// # Errors
///
/// Returns the first `CompilationError` encountered while lexing.
pub fn lex(text: &'a str) -> CompilationResult<Vec<Token<'a>>> {
    let lexer = Lexer {
        tokens: vec![],
        text,
        rest: text,
        index: 0,
        line: 0,
        column: 0,
        state: vec![State::Start],
    };

    lexer.inner()
}
/// Creates a `CompilationError` of the given `kind` located at the lexer's
/// current position. The error carries no width.
fn error(&self, kind: CompilationErrorKind<'a>) -> CompilationError<'a> {
    CompilationError {
        text: self.text,
        index: self.index,
        line: self.line,
        column: self.column,
        width: None,
        kind,
    }
}
/// Creates a token of the given `kind` located at the lexer's current
/// position.
///
/// `prefix` is the whitespace that precedes the token and `lexeme` is the
/// token text itself; neither the position nor `rest` is advanced here.
fn token(&self, prefix: &'a str, lexeme: &'a str, kind: TokenKind) -> Token<'a> {
    Token {
        index: self.index,
        line: self.line,
        column: self.column,
        text: self.text,
        prefix,
        lexeme,
        kind,
    }
}
2017-12-01 02:22:32 -08:00
fn lex_indent(&mut self) -> CompilationResult<'a, Option<Token<'a>>> {
2017-11-18 03:36:02 -08:00
lazy_static! {
static ref INDENT: Regex = re(r"^([ \t]*)[^ \t\n\r]");
}
let indentation = INDENT
.captures(self.rest)
.map(|captures| captures.get(1).unwrap().as_str());
2017-11-18 03:36:02 -08:00
if self.column == 0 {
if let Some(kind) = match (self.state.last().unwrap(), indentation) {
// ignore: was no indentation and there still isn't
// or current line is blank
(&State::Start, Some("")) | (_, None) => None,
2017-11-18 03:36:02 -08:00
// indent: was no indentation, now there is
(&State::Start, Some(current)) => {
if mixed_whitespace(current) {
return Err(self.error(MixedLeadingWhitespace {
whitespace: current,
}));
2017-11-18 03:36:02 -08:00
}
//indent = Some(current);
self.state.push(State::Indent(current));
Some(Indent)
}
// dedent: there was indentation and now there isn't
(&State::Indent(_), Some("")) => {
// indent = None;
self.state.pop();
Some(Dedent)
}
// was indentation and still is, check if the new indentation matches
(&State::Indent(previous), Some(current)) => {
if !current.starts_with(previous) {
return Err(self.error(InconsistentLeadingWhitespace {
2017-11-18 03:36:02 -08:00
expected: previous,
found: current,
2017-11-18 03:36:02 -08:00
}));
}
None
}
// at column 0 in some other state: this should never happen
(&State::Text, _) | (&State::Interpolation, _) => {
return Err(self.error(Internal {
message: "unexpected state at column 0".to_string(),
2017-11-18 03:36:02 -08:00
}));
}
} {
return Ok(Some(self.token("", "", kind)));
}
}
Ok(None)
}
pub fn inner(mut self) -> CompilationResult<'a, Vec<Token<'a>>> {
lazy_static! {
static ref AT: Regex = token(r"@");
static ref BACKTICK: Regex = token(r"`[^`\n\r]*`");
static ref COLON: Regex = token(r":");
static ref COMMA: Regex = token(r",");
static ref COMMENT: Regex = token(r"#([^\n\r][^\n\r]*)?\r?$");
static ref EOF: Regex = token(r"\z");
static ref EOL: Regex = token(r"\n|\r\n");
static ref EQUALS: Regex = token(r"=");
static ref INTERPOLATION_END: Regex = token(r"[}][}]");
static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]");
static ref NAME: Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)");
static ref PAREN_L: Regex = token(r"[(]");
static ref PAREN_R: Regex = token(r"[)]");
static ref PLUS: Regex = token(r"[+]");
static ref RAW_STRING: Regex = token(r#"'[^']*'"#);
static ref STRING: Regex = token(r#"["]"#);
static ref UNTERMINATED_RAW_STRING: Regex = token(r#"'[^']*"#);
static ref INTERPOLATION_START: Regex = re(r"^[{][{]");
static ref LEADING_TEXT: Regex = re(r"^(?m)(.+?)[{][{]");
static ref LINE: Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$");
static ref TEXT: Regex = re(r"^(?m)(.+)");
2017-11-18 03:36:02 -08:00
}
loop {
2017-12-01 02:22:32 -08:00
if let Some(token) = self.lex_indent()? {
2017-11-18 03:36:02 -08:00
self.tokens.push(token);
}
// insert a dedent if we're indented and we hit the end of the file
if &State::Start != self.state.last().unwrap() && EOF.is_match(self.rest) {
let token = self.token("", "", Dedent);
self.tokens.push(token);
}
let (prefix, lexeme, kind) = if let (0, &State::Indent(indent), Some(captures)) = (
self.column,
self.state.last().unwrap(),
LINE.captures(self.rest),
) {
2017-11-18 03:36:02 -08:00
let line = captures.get(0).unwrap().as_str();
if !line.starts_with(indent) {
return Err(self.error(Internal {
message: "unexpected indent".to_string(),
}));
2017-11-18 03:36:02 -08:00
}
self.state.push(State::Text);
(&line[0..indent.len()], "", Line)
} else if let Some(captures) = EOF.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Eof,
)
2017-11-18 03:36:02 -08:00
} else if let State::Text = *self.state.last().unwrap() {
if let Some(captures) = INTERPOLATION_START.captures(self.rest) {
self.state.push(State::Interpolation);
("", captures.get(0).unwrap().as_str(), InterpolationStart)
} else if let Some(captures) = LEADING_TEXT.captures(self.rest) {
("", captures.get(1).unwrap().as_str(), Text)
} else if let Some(captures) = TEXT.captures(self.rest) {
("", captures.get(1).unwrap().as_str(), Text)
} else if let Some(captures) = EOL.captures(self.rest) {
self.state.pop();
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Eol,
)
2017-11-18 03:36:02 -08:00
} else {
return Err(self.error(Internal {
message: format!("Could not match token in text state: \"{}\"", self.rest),
2017-11-18 03:36:02 -08:00
}));
}
} else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
InterpolationStart,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = INTERPOLATION_END.captures(self.rest) {
if self.state.last().unwrap() == &State::Interpolation {
self.state.pop();
}
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
InterpolationEnd,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = NAME.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Name,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = EOL.captures(self.rest) {
if self.state.last().unwrap() == &State::Interpolation {
return Err(self.error(UnterminatedInterpolation));
2017-11-18 03:36:02 -08:00
}
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Eol,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = BACKTICK.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Backtick,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = COLON.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Colon,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = AT.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
At,
)
} else if let Some(captures) = COMMA.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Comma,
)
} else if let Some(captures) = PAREN_L.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
ParenL,
)
} else if let Some(captures) = PAREN_R.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
ParenR,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = PLUS.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Plus,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = EQUALS.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Equals,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = COMMENT.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
Comment,
)
2017-11-18 03:36:02 -08:00
} else if let Some(captures) = RAW_STRING.captures(self.rest) {
(
captures.get(1).unwrap().as_str(),
captures.get(2).unwrap().as_str(),
RawString,
)
2017-11-18 03:36:02 -08:00
} else if UNTERMINATED_RAW_STRING.is_match(self.rest) {
return Err(self.error(UnterminatedString));
} else if let Some(captures) = STRING.captures(self.rest) {
let prefix = captures.get(1).unwrap().as_str();
let contents = &self.rest[prefix.len() + 1..];
2017-11-18 03:36:02 -08:00
if contents.is_empty() {
return Err(self.error(UnterminatedString));
}
let mut len = 0;
let mut escape = false;
for c in contents.chars() {
if c == '\n' || c == '\r' {
return Err(self.error(UnterminatedString));
} else if !escape && c == '"' {
break;
} else if !escape && c == '\\' {
escape = true;
} else if escape {
escape = false;
}
len += c.len_utf8();
}
let start = prefix.len();
let content_end = start + len + 1;
if escape || content_end >= self.rest.len() {
return Err(self.error(UnterminatedString));
}
(prefix, &self.rest[start..=content_end], StringToken)
2017-11-18 03:36:02 -08:00
} else {
return Err(self.error(UnknownStartOfToken));
};
let token = self.token(prefix, lexeme, kind);
self.tokens.push(token);
let len = prefix.len() + lexeme.len();
if len == 0 {
let last = self.tokens.last().unwrap();
match last.kind {
Eof => {}
_ => {
return Err(last.error(Internal {
message: format!("zero length token: {:?}", last),
}));
}
2017-11-18 03:36:02 -08:00
}
}
match self.tokens.last().unwrap().kind {
Eol => {
self.line += 1;
self.column = 0;
}
Eof => {
break;
}
RawString => {
let lexeme_lines = lexeme.lines().count();
self.line += lexeme_lines - 1;
if lexeme_lines == 1 {
self.column += len;
} else {
self.column = lexeme.lines().last().unwrap().len();
}
}
_ => {
self.column += len;
}
}
self.rest = &self.rest[len..];
self.index += len;
}
Ok(self.tokens)
}
}
#[cfg(test)]
mod test {
use super::*;
// Declares a `#[test]` named `$name` that lexes `$input` and checks two
// things: the one-character-per-token summary produced by `token_summary`
// equals `$expected`, and concatenating every token's prefix and lexeme
// reproduces `$input` exactly.
macro_rules! summary_test {
    ($name:ident, $input:expr, $expected:expr $(,)*) => {
        #[test]
        fn $name() {
            let input = $input;
            let expected = $expected;
            let tokens = Lexer::lex(input).unwrap();
            let roundtrip = tokens
                .iter()
                .map(|t| format!("{}{}", t.prefix, t.lexeme))
                .collect::<String>();
            let actual = token_summary(&tokens);
            if actual != expected {
                panic!(
                    "token summary mismatch:\nexpected: {}\ngot: {}\n",
                    expected, actual
                );
            }
            assert_eq!(input, roundtrip);
        }
    };
}
fn token_summary(tokens: &[Token]) -> String {
tokens
.iter()
.map(|t| match t.kind {
At => "@",
Backtick => "`",
Colon => ":",
Comma => ",",
Comment { .. } => "#",
Dedent => "<",
Eof => ".",
Eol => "$",
Equals => "=",
Indent { .. } => ">",
InterpolationEnd => "}",
2017-11-18 03:36:02 -08:00
InterpolationStart => "{",
Line { .. } => "^",
Name => "N",
ParenL => "(",
ParenR => ")",
Plus => "+",
RawString => "'",
StringToken => "\"",
Text => "_",
})
.collect::<Vec<_>>()
.join("")
2017-11-18 03:36:02 -08:00
}
// Declares a `#[test]` named `$name` asserting that lexing `$input` fails
// with a `CompilationError` at the given position and of the given kind.
// The individual fields are compared before the whole error so that a
// failure points at the first field that differs.
macro_rules! error_test {
    (
        name: $name:ident,
        input: $input:expr,
        index: $index:expr,
        line: $line:expr,
        column: $column:expr,
        width: $width:expr,
        kind: $kind:expr,
    ) => {
        #[test]
        fn $name() {
            let input = $input;
            let expected = CompilationError {
                text: input,
                index: $index,
                line: $line,
                column: $column,
                width: $width,
                kind: $kind,
            };
            if let Err(error) = Lexer::lex(input) {
                assert_eq!(error.text, expected.text);
                assert_eq!(error.index, expected.index);
                assert_eq!(error.line, expected.line);
                assert_eq!(error.column, expected.column);
                assert_eq!(error.kind, expected.kind);
                assert_eq!(error, expected);
            } else {
                panic!("tokenize succeeded but expected: {}\n{}", expected, input);
            }
        }
    };
}
summary_test! {
tokenize_strings,
r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#,
r#"N="+'+"+'#."#,
}
summary_test! {
tokenize_recipe_interpolation_eol,
"foo: # some comment
{{hello}}
",
"N:#$>^{N}$<.",
}
summary_test! {
tokenize_recipe_interpolation_eof,
"foo: # more comments
{{hello}}
# another comment
",
"N:#$>^{N}$<#$.",
}
summary_test! {
tokenize_recipe_complex_interpolation_expression,
"foo: #lol\n {{a + b + \"z\" + blarg}}",
"N:#$>^{N+N+\"+N}<.",
}
summary_test! {
tokenize_recipe_multiple_interpolations,
"foo:,#ok\n {{a}}0{{b}}1{{c}}",
"N:,#$>^{N}_{N}_{N}<.",
2017-11-18 03:36:02 -08:00
}
summary_test! {
tokenize_junk,
"bob
hello blah blah blah : a b c #whatever
",
"N$$NNNN:NNN#$.",
}
summary_test! {
tokenize_empty_lines,
"
# this does something
hello:
asdf
bsdf
csdf
dsdf # whatever
# yolo
",
"$#$N:$>^_$^_$$^_$$^_$$<#$.",
}
summary_test! {
tokenize_comment_before_variable,
"
#
A='1'
echo:
echo {{A}}
",
"$#$N='$N:$>^_{N}$<.",
}
summary_test! {
tokenize_interpolation_backticks,
"hello:\n echo {{`echo hello` + `echo goodbye`}}",
"N:$>^_{`+`}<.",
}
summary_test! {
tokenize_assignment_backticks,
"a = `echo hello` + `echo goodbye`",
"N=`+`.",
}
summary_test! {
tokenize_multiple,
"
hello:
a
b
c
d
# hello
bob:
frank
",
"$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.",
}
summary_test! {
tokenize_comment,
"a:=#",
"N:=#."
}
2018-01-22 23:17:14 -08:00
summary_test! {
tokenize_comment_with_bang,
"a:=#foo!",
"N:=#."
}
2017-11-18 03:36:02 -08:00
summary_test! {
tokenize_order,
r"
b: a
@mv a b
a:
@touch F
@touch a
d: c
@rm c
c: b
@mv b c",
"$N:N$>^_$$<N:$>^_$^_$$<N:N$>^_$$<N:N$>^_<.",
}
summary_test! {
tokenize_parens,
r"((())) )abc(+",
"((())))N(+.",
}
2018-01-05 02:03:58 -08:00
summary_test! {
crlf_newline,
"#\r\n#asdf\r\n",
"#$#$.",
}
2017-11-18 03:36:02 -08:00
error_test! {
name: tokenize_space_then_tab,
input: "a:
0
1
\t2
",
index: 9,
line: 3,
column: 0,
width: None,
kind: InconsistentLeadingWhitespace{expected: " ", found: "\t"},
}
error_test! {
name: tokenize_tabs_then_tab_space,
input: "a:
\t\t0
\t\t 1
\t 2
",
index: 12,
line: 3,
column: 0,
width: None,
kind: InconsistentLeadingWhitespace{expected: "\t\t", found: "\t "},
}
error_test! {
name: tokenize_unknown,
input: "~",
index: 0,
line: 0,
column: 0,
width: None,
kind: UnknownStartOfToken,
}
error_test! {
name: unterminated_string,
input: r#"a = ""#,
index: 3,
line: 0,
column: 3,
width: None,
kind: UnterminatedString,
}
error_test! {
name: unterminated_string_with_escapes,
input: r#"a = "\n\t\r\"\\"#,
index: 3,
line: 0,
column: 3,
width: None,
kind: UnterminatedString,
}
error_test! {
name: unterminated_raw_string,
input: "r a='asdf",
index: 4,
line: 0,
column: 4,
width: None,
kind: UnterminatedString,
}
error_test! {
name: unterminated_interpolation,
input: "foo:\n echo {{
",
index: 13,
line: 1,
column: 8,
width: None,
kind: UnterminatedInterpolation,
}
2017-11-18 03:36:02 -08:00
error_test! {
2018-01-05 02:03:58 -08:00
name: mixed_leading_whitespace,
input: "a:\n\t echo hello",
2017-11-18 03:36:02 -08:00
index: 3,
line: 1,
column: 0,
width: None,
kind: MixedLeadingWhitespace{whitespace: "\t "},
}
}