Lexer code deduplication and refactoring (#414)

This commit is contained in:
Casey Rodarmor 2019-04-18 13:12:38 -07:00 committed by GitHub
parent 0ad5574ecc
commit d065d1c54f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -85,6 +85,21 @@ impl<'a> Lexer<'a> {
&self.text[self.token_start.offset..self.token_end.offset] &self.text[self.token_start.offset..self.token_end.offset]
} }
/// Distance between the start and end offsets of the token being lexed
fn current_token_length(&self) -> usize {
  let start = self.token_start.offset;
  let end = self.token_end.offset;
  end - start
}
/// Returns true when there is a next character and it equals `c`
fn next_is(&self, c: char) -> bool {
  self.next.map_or(false, |upcoming| upcoming == c)
}
/// Returns true when the next character is a space or a tab
fn next_is_whitespace(&self) -> bool {
  match self.next {
    Some(' ') | Some('\t') => true,
    _ => false,
  }
}
/// Un-lexed text /// Un-lexed text
fn rest(&self) -> &'a str { fn rest(&self) -> &'a str {
&self.text[self.token_end.offset..] &self.text[self.token_end.offset..]
@ -95,9 +110,14 @@ impl<'a> Lexer<'a> {
self.rest().starts_with(prefix) self.rest().starts_with(prefix)
} }
/// Length of current token /// Does rest start with "\n" or "\r\n"?
fn current_token_length(&self) -> usize { fn at_eol(&self) -> bool {
self.token_end.offset - self.token_start.offset self.next_is('\n') || self.rest_starts_with("\r\n")
}
/// Are we at end-of-line or end-of-file?
fn at_eol_or_eof(&self) -> bool {
self.at_eol() || self.rest().is_empty()
} }
/// Get current state /// Get current state
@ -237,7 +257,7 @@ impl<'a> Lexer<'a> {
// Handle blank line // Handle blank line
if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() { if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() {
while let Some(' ') | Some('\t') = self.next { while self.next_is_whitespace() {
self.advance()?; self.advance()?;
} }
@ -250,7 +270,7 @@ impl<'a> Lexer<'a> {
} }
// Handle nonblank lines with no leading whitespace // Handle nonblank lines with no leading whitespace
if self.next != Some(' ') && self.next != Some('\t') { if !self.next_is_whitespace() {
if let State::Indented { .. } = self.state()? { if let State::Indented { .. } = self.state()? {
self.token(Dedent); self.token(Dedent);
self.pop_state()?; self.pop_state()?;
@ -261,30 +281,33 @@ impl<'a> Lexer<'a> {
// Handle continued indentation // Handle continued indentation
if let State::Indented { indentation } = self.state()? { if let State::Indented { indentation } = self.state()? {
let mut remaining = indentation.len(); if self.rest_starts_with(indentation) {
for _ in indentation.chars() {
self.advance()?;
}
// Advance over whitespace up to length of current indentation // Indentation matches, lex as whitespace
while let Some(' ') | Some('\t') = self.next { self.token(Whitespace);
self.advance()?;
remaining -= 1; return Ok(());
if remaining == 0 { }
// Consume whitespace characters, matching or not, up to the length
// of expected indentation
for _ in indentation.chars().zip(self.rest().chars()) {
if self.next_is_whitespace() {
self.advance()?;
} else {
break; break;
} }
} }
let lexeme = self.lexeme(); // We've either advanced over not enough whitespace or mismatching
// whitespace, so return an error
if lexeme != indentation { return Err(self.error(InconsistentLeadingWhitespace {
return Err(self.error(InconsistentLeadingWhitespace { expected: indentation,
expected: indentation, found: self.lexeme(),
found: lexeme, }));
}));
}
// Indentation matches, lex as whitespace
self.token(Whitespace);
return Ok(());
} }
if self.state()? != State::Normal { if self.state()? != State::Normal {
@ -295,7 +318,7 @@ impl<'a> Lexer<'a> {
} }
// Handle new indentation // Handle new indentation
while let Some(' ') | Some('\t') = self.next { while self.next_is_whitespace() {
self.advance()?; self.advance()?;
} }
@ -356,7 +379,7 @@ impl<'a> Lexer<'a> {
self.pop_state()?; self.pop_state()?;
// Emit interpolation end token // Emit interpolation end token
self.lex_double(InterpolationEnd) self.lex_double(InterpolationEnd)
} else if self.rest_starts_with("\n") || self.rest_starts_with("\r\n") { } else if self.at_eol_or_eof() {
// Return unterminated interpolation error that highlights the opening {{ // Return unterminated interpolation error that highlights the opening {{
Err(self.unterminated_interpolation_error(interpolation_start)) Err(self.unterminated_interpolation_error(interpolation_start))
} else { } else {
@ -446,7 +469,7 @@ impl<'a> Lexer<'a> {
fn lex_colon(&mut self) -> CompilationResult<'a, ()> { fn lex_colon(&mut self) -> CompilationResult<'a, ()> {
self.advance()?; self.advance()?;
if let Some('=') = self.next { if self.next_is('=') {
self.advance()?; self.advance()?;
self.token(ColonEquals); self.token(ColonEquals);
} else { } else {
@ -492,8 +515,10 @@ impl<'a> Lexer<'a> {
/// Lex name: [a-zA-Z_][a-zA-Z0-9_]* /// Lex name: [a-zA-Z_][a-zA-Z0-9_]*
fn lex_name(&mut self) -> CompilationResult<'a, ()> { fn lex_name(&mut self) -> CompilationResult<'a, ()> {
while let Some('a'...'z') | Some('A'...'Z') | Some('0'...'9') | Some('_') | Some('-') = while self
self.next .next
.map(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
.unwrap_or(false)
{ {
self.advance()?; self.advance()?;
} }
@ -508,11 +533,7 @@ impl<'a> Lexer<'a> {
// advance over # // advance over #
self.advance()?; self.advance()?;
loop { while !self.at_eol_or_eof() {
if let Some('\r') | Some('\n') | None = self.next {
break;
}
self.advance()?; self.advance()?;
} }
@ -523,22 +544,18 @@ impl<'a> Lexer<'a> {
/// Lex backtick: `[^\r\n]*` /// Lex backtick: `[^\r\n]*`
fn lex_backtick(&mut self) -> CompilationResult<'a, ()> { fn lex_backtick(&mut self) -> CompilationResult<'a, ()> {
// advance over ` // advance over initial `
self.advance()?; self.advance()?;
loop { while !self.next_is('`') {
if let Some('\r') | Some('\n') | None = self.next { if self.at_eol_or_eof() {
return Err(self.error(UnterminatedBacktick)); return Err(self.error(UnterminatedBacktick));
} }
if let Some('`') = self.next {
self.advance()?;
break;
}
self.advance()?; self.advance()?;
} }
self.advance()?;
self.token(Backtick); self.token(Backtick);
Ok(()) Ok(())
@ -546,7 +563,7 @@ impl<'a> Lexer<'a> {
/// Lex whitespace: [ \t]+ /// Lex whitespace: [ \t]+
fn lex_whitespace(&mut self) -> CompilationResult<'a, ()> { fn lex_whitespace(&mut self) -> CompilationResult<'a, ()> {
while let Some(' ') | Some('\t') = self.next { while self.next_is_whitespace() {
self.advance()? self.advance()?
} }