use std::{cell::RefCell, rc::Rc}; use nom::{ branch::alt, bytes::complete::{tag, escaped_transform, take_while, take_till}, character::is_alphanumeric, character::complete::{ alpha1, alphanumeric0, char, line_ending, none_of, not_line_ending, one_of, space0, space1, }, combinator::{map, not, opt, peek, recognize, value}, error::{context, ParseError, VerboseError}, multi::{fold_many1, many0, many1, separated_list0, separated_list1}, sequence::{delimited, pair, preceded, tuple}, Err, IResult, Parser, }; use nom_locate::{position, LocatedSpan}; use crate::identifier::{Id, IdStore}; type StoreRef = Rc>>; pub type Span<'a> = LocatedSpan<&'a str, StoreRef>; type ParseResult<'a, O> = IResult, O, VerboseError>>; use crate::ast::*; fn rc_string(s: &str) -> Rc { Rc::new(s.to_string()) } fn fresh_id(span: &Span) -> Id { let mut table_handle = span.extra.borrow_mut(); table_handle.fresh() } fn fresh_id_rc(store_ref: &StoreRef) -> Id { let mut table_handle = store_ref.borrow_mut(); table_handle.fresh() } fn tok<'a, O>( input_parser: impl Parser, O, VerboseError>>, ) -> impl FnMut(Span<'a>) -> IResult, O, VerboseError>> { context("tok", map(tuple((ws0, input_parser)), |(_, output)| output)) } fn kw<'a>(keyword_str: &'static str) -> impl FnMut(Span<'a>) -> ParseResult<()> { context("keyword", tok(value((), tag(keyword_str)))) } // whitespace does consume at least one piece of whitespace - use ws0 for maybe none fn whitespace(input: Span) -> ParseResult<()> { context("whitespace", alt((block_comment, line_comment, value((), space1))))(input) } fn ws0(input: Span) -> ParseResult<()> { context("WS0", value((), many0(whitespace)))(input) } fn line_comment(input: Span) -> ParseResult<()> { value((), tuple((tag("//"), not_line_ending)))(input) } fn block_comment(input: Span) -> ParseResult<()> { context( "Block-comment", value( (), tuple(( tag("/*"), many0(alt((value((), none_of("*/")), value((), none_of("/*")), block_comment))), tag("*/"), )), ), )(input) } fn statement_delimiter(input: Span) -> ParseResult<()> { tok(alt((value((), line_ending), value((), char(';')))))(input) } pub fn program(input: Span) -> ParseResult { let id = fresh_id(&input); //TODO `rest` should be empty let (rest, statements) = context("AST", map( tuple(( many0(statement_delimiter), separated_list0(statement_delimiter, statement), many0(statement_delimiter), )), |(_, items, _)| items.into()) )(input)?; println!("REST: {}", rest.fragment()); let ast = AST { id, statements }; Ok((rest, ast)) } pub fn block(input: Span) -> ParseResult { context( "block", map( tuple(( tok(char('{')), many0(statement_delimiter), separated_list0(statement_delimiter, statement), many0(statement_delimiter), tok(char('}')), )), |(_, _, items, _, _)| items.into(), ), )(input) } fn statement(input: Span) -> ParseResult { let (input, pos) = position(input)?; let location = pos.location_offset().into(); let id = fresh_id(&input); let (rest, kind) = context( "Parsing-statement", alt(( map(declaration, StatementKind::Declaration), map(expression, StatementKind::Expression), )) )(input)?; Ok((rest, Statement { id, location, kind })) } fn declaration(input: Span) -> ParseResult { alt((binding, module))(input) } fn binding(input: Span) -> ParseResult { let parser = tuple((kw("let"), opt(kw("mut")), tok(identifier), opt(type_anno), tok(char('=')), expression)); map(parser, |(_, maybe_mut, ident, type_anno, _, expr)| Declaration::Binding { name: rc_string(ident.fragment()), constant: maybe_mut.is_none(), type_anno, expr })(input) } fn module(input: Span) -> ParseResult { map(tuple((kw("module"), tok(identifier), block)), |(_, name, items)| Declaration::Module { name: rc_string(name.fragment()), items })(input) } pub fn expression(input: Span) -> ParseResult { let id = fresh_id(&input); map(pair(expression_kind, opt(type_anno)), move |(kind, type_anno)| Expression { id, type_anno, kind })(input) } fn type_anno(input: Span) -> ParseResult { preceded(tok(char(':')), type_identifier)(input) } fn type_identifier(input: Span) -> ParseResult { alt(( map(delimited(tok(char('(')), separated_list0(tok(char(',')), type_identifier), tok(char(')'))), TypeIdentifier::Tuple), map(type_singleton_name, TypeIdentifier::Singleton), ))(input) } fn type_singleton_name(input: Span) -> ParseResult { map(pair(tok(identifier), opt(type_params)), |(name, params)| TypeSingletonName { name: rc_string(name.fragment()), params: if let Some(params) = params { params } else { vec![] } })(input) } fn type_params(input: Span) -> ParseResult> { delimited( tok(char('<')), separated_list1(tok(char(',')), type_identifier), tok(char('>')) )(input) } pub fn expression_kind(input: Span) -> ParseResult { context("expression-kind", precedence_expr)(input) } fn precedence_expr(input: Span) -> ParseResult { let handle = input.extra.clone(); map( pair(prefix_expr, many0(precedence_continuation)), move |(first, rest): (ExpressionKind, Vec<(BinOp, ExpressionKind)>)| { let mut handle_ref = handle.borrow_mut(); BinopSequence { first, rest }.do_precedence(&mut handle_ref) })(input) } fn precedence_continuation(input: Span) -> ParseResult<(BinOp, ExpressionKind)> { pair(operator, prefix_expr)(input) } fn operator(input: Span) -> ParseResult { tok(map( tuple((not(tag("*/")), recognize(many1(one_of("+-*/%<>=!$&|?^`"))))), |(_, sigil_span): ((), Span)| BinOp::from_sigil(sigil_span.fragment()), ))(input) } fn prefix_op(input: Span) -> ParseResult { tok(map(recognize(one_of("+-!")), |sigil: Span| PrefixOp::from_sigil(sigil.fragment())))(input) } fn prefix_expr(input: Span) -> ParseResult { let handle = input.extra.clone(); context( "prefix-expr", map(pair(opt(prefix_op), extended_expr), move |(prefix, expr)| { if let Some(prefix) = prefix { let expr = Expression::new(fresh_id_rc(&handle), expr); ExpressionKind::PrefixExp(prefix, Box::new(expr)) } else { expr } }), )(input) } #[derive(Debug)] enum ExtendedPart<'a> { Index(Vec), Call(Vec), Accessor(&'a str), } fn extended_expr(input: Span) -> ParseResult { let (s, (primary, parts)) = context("extended-expr", pair(primary_expr, many0(extended_expr_part)))(input)?; let mut expression = Expression::new(fresh_id(&s), primary); for part in parts.into_iter() { let kind = match part { ExtendedPart::Index(indexers) => { ExpressionKind::Index { indexee: Box::new(expression), indexers } }, ExtendedPart::Call(arguments) => { ExpressionKind::Call { f: Box::new(expression), arguments } } ExtendedPart::Accessor(name) => { let name = rc_string(name); ExpressionKind::Access { name, expr: Box::new(expression) } }, }; expression = Expression::new(fresh_id(&s), kind); } Ok((s, expression.kind)) } fn extended_expr_part(input: Span) -> ParseResult { fn index_part(input: Span) -> ParseResult> { delimited( tok(char('[')), separated_list1(tok(char(',')), expression), tok(char(']')), )(input) } fn call_part(input: Span) -> ParseResult> { delimited( tok(char('(')), separated_list0(tok(char(',')), invocation_argument), tok(char(')')), )(input) } fn access_part(input: Span) -> ParseResult<&str> { preceded( tok(char('.')), map(identifier, |item| *item.fragment()) )(input) } alt(( map(index_part, ExtendedPart::Index), map(call_part, ExtendedPart::Call), map(access_part, ExtendedPart::Accessor) ))(input) } //TODO this shouldn't be an expression b/c type annotations disallowed here fn invocation_argument(input: Span) -> ParseResult { alt(( map(tok(char('_')), |_| InvocationArgument::Ignored), map(tuple(( tok(identifier), tok(char('=')), expression, )), |(name, _, expr)| InvocationArgument::Keyword { name: rc_string(name.fragment()), expr }), map(expression, InvocationArgument::Positional), ))(input) } fn primary_expr(input: Span) -> ParseResult { context("primary-expr", alt(( list_expr, paren_expr, string_literal, float_literal, number_literal, bool_literal, identifier_expr )) )(input) } fn paren_expr(input: Span) -> ParseResult { delimited( tok(char('(')), map(separated_list0(tok(char(',')), expression), |mut exprs| match exprs.len() { 1 => exprs.pop().unwrap().kind, _ => ExpressionKind::TupleLiteral(exprs), }), tok(char(')')) )(input) } fn list_expr(input: Span) -> ParseResult { map( delimited( tok(char('[')), separated_list0(tok(char(',')), expression), tok(char(']')), ), |items| ExpressionKind::ListLiteral(items))(input) } //TODO need to do something with prefix in the AST fn string_literal(input: Span) -> ParseResult { tok( map(pair(opt(identifier), bare_string_literal), |(_maybe_prefix, s)| ExpressionKind::StringLiteral(Rc::new(s))) )(input) } fn bare_string_literal(input: Span) -> ParseResult { let string_escape_transforms = alt(( value("\\", tag("\\")), value("\"", tag("\"")), value("\n", tag("n")), value("\t", tag("t")), )); alt((map(tag(r#""""#), |_| String::new()), map( tuple(( char('"'), escaped_transform(none_of(r#""\"#), '\\', string_escape_transforms), char('"'), )), |(_, s, _)| s)))(input) } fn identifier_expr(input: Span) -> ParseResult { context("identifier-expr", map(qualified_identifier, ExpressionKind::Value))(input) } fn qualified_identifier(input: Span) -> ParseResult { let id = fresh_id(&input); tok(map(separated_list1(tag("::"), map(identifier, |x| rc_string(x.fragment()))), move |items| { QualifiedName { id, components: items } }))(input) } fn identifier(input: Span) -> ParseResult { recognize(pair(alt((tag("_"), alpha1)), take_while(|ch: char| { is_alphanumeric(ch as u8) || ch == '_'})))(input) } fn bool_literal(input: Span) -> ParseResult { context( "bool-literal", alt(( map(kw("true"), |_| ExpressionKind::BoolLiteral(true)), map(kw("false"), |_| ExpressionKind::BoolLiteral(false)), )), )(input) } fn float_literal(input: Span) -> ParseResult { tok(map( alt(( recognize(tuple((digits(digit_group_dec), char('.'), opt(digits(digit_group_dec))))), recognize(tuple((char('.'), digits(digit_group_dec)))), )), |ds| ExpressionKind::FloatLiteral(ds.fragment().parse().unwrap()), ))(input) } fn number_literal(input: Span) -> ParseResult { map(alt((tok(hex_literal), tok(bin_literal), tok(dec_literal))), ExpressionKind::NatLiteral)(input) } fn dec_literal(input: Span) -> ParseResult { map(digits(digit_group_dec), |chars: Vec| { let s: String = chars.into_iter().collect(); s.parse().unwrap() })(input) } fn hex_literal(input: Span) -> ParseResult { map(preceded(alt((tag("0x"), tag("0X"))), digits(digit_group_hex)), |chars: Vec| { let s: String = chars.into_iter().collect(); parse_hex(&s).unwrap() })(input) } fn bin_literal(input: Span) -> ParseResult { map(preceded(alt((tag("0b"), tag("0B"))), digits(digit_group_bin)), |chars: Vec| { let s: String = chars.into_iter().collect(); parse_binary(&s).unwrap() })(input) } fn digits<'a, E: ParseError>>( digit_type: impl Parser, Vec, E>, ) -> impl FnMut(Span<'a>) -> IResult, Vec, E> { map(separated_list1(many1(char('_')), digit_type), |items: Vec>| { items.into_iter().flatten().collect() }) } fn digit_group_dec(input: Span) -> ParseResult> { many1(one_of("0123456789"))(input) } fn digit_group_hex(input: Span) -> ParseResult> { many1(one_of("0123456789abcdefABCDEF"))(input) } fn digit_group_bin(input: Span) -> ParseResult> { many1(one_of("01"))(input) } fn parse_binary(digits: &str) -> Result { let mut result: u64 = 0; let mut multiplier = 1; for d in digits.chars().rev() { match d { '1' => result += multiplier, '0' => (), '_' => continue, _ => unreachable!(), } multiplier = match multiplier.checked_mul(2) { Some(m) => m, None => return Err("Binary expression will overflow"), } } Ok(result) } fn parse_hex(digits: &str) -> Result { let mut result: u64 = 0; let mut multiplier: u64 = 1; for d in digits.chars().rev() { if d == '_' { continue; } match d.to_digit(16) { Some(n) => result += n as u64 * multiplier, None => return Err("Internal parser error: invalid hex digit"), } multiplier = match multiplier.checked_mul(16) { Some(m) => m, None => return Err("Hexadecimal expression will overflow"), } } Ok(result) } #[derive(Debug)] struct BinopSequence { first: ExpressionKind, rest: Vec<(BinOp, ExpressionKind)>, } impl BinopSequence { fn do_precedence(self, store: &mut IdStore) -> ExpressionKind { fn helper( precedence: i32, lhs: ExpressionKind, rest: &mut Vec<(BinOp, ExpressionKind)>, store: &mut IdStore, ) -> Expression { let mut lhs = Expression::new(store.fresh(), lhs); while let Some((next_op, next_rhs)) = rest.pop() { let new_precedence = next_op.get_precedence(); if precedence >= new_precedence { rest.push((next_op, next_rhs)); break; } let rhs = helper(new_precedence, next_rhs, rest, store); lhs = Expression::new( store.fresh(), ExpressionKind::BinExp(next_op, Box::new(lhs), Box::new(rhs)), ); } lhs } let mut as_stack = self.rest.into_iter().rev().collect(); helper(BinOp::min_precedence(), self.first, &mut as_stack, store).kind } } #[cfg(test)] mod test { use pretty_assertions::assert_eq; use super::*; fn rc(s: &str) -> Rc { Rc::new(s.to_owned()) } macro_rules! qn { ( $( $component:ident),* ) => { { let mut components = vec![]; $( components.push(rc(stringify!($component))); )* QualifiedName { components, id: Default::default() } } }; } macro_rules! span { ($func:expr, $input:expr) => {{ let id_store: IdStore = IdStore::new(); let span = Span::new_extra($input, Rc::new(RefCell::new(id_store))); $func(span).map(|(span, x)| (*span.fragment(), x)) }}; } #[test] fn combinator_test1() { assert_eq!(span!(digits(digit_group_dec), "342"), Ok(("", vec!['3', '4', '2']))); assert_eq!(span!(bin_literal, "0b1111qsdf"), Ok(("qsdf", 15))); assert_eq!(span!(bare_string_literal, r#""fah""#), Ok(("", "fah".to_string()))); assert_eq!(span!(bare_string_literal, r#""""#), Ok(("", "".to_string()))); } #[test] fn combinator_test_ws0() { assert_eq!(span!(block_comment, "/*yolo*/"), Ok(("", ()))); assert_eq!(span!(block_comment, "/*yolo*/ jumpy /*nah*/"), Ok((" jumpy /*nah*/", ()))); assert_eq!(span!(ws0, "/* yolo */ "), Ok(("", ()))); assert_eq!(span!(ws0, "/* /* no */ yolo */ "), Ok(("", ()))); } #[test] fn combinator_test2() { for s in [" 15", " 0b1111", " 1_5_", "0XF__", "0Xf"].iter() { assert_eq!(span!(expression_kind, s).unwrap().1, ExpressionKind::NatLiteral(15)); } assert_eq!(span!(expression_kind, " /*gay*/ true").unwrap().1, ExpressionKind::BoolLiteral(true)); assert_eq!( span!(expression_kind, " /*yolo*/ barnaby").unwrap().1, ExpressionKind::Value(qn!(barnaby)) ); } }