From d797df5488bf7ed84b6a53cc503bec15262a31bf Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sat, 16 Oct 2021 20:08:42 -0700
Subject: initial copy of hand-written sexpr parsing

---
 src/sexpr.rs | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 src/sexpr.rs

diff --git a/src/sexpr.rs b/src/sexpr.rs
new file mode 100644
index 0000000..bc13bd4
--- /dev/null
+++ b/src/sexpr.rs
@@ -0,0 +1,284 @@
+/*
+ * Simple hand-rolled "S-Expression" parser with no dependencies.
+ *
+ * Copied and tweaked from my 'spectrum' rust scheme implementation.
+ *
+ * Not performant: allocates all over the place, takes a full string to parse as input, etc.
+ *
+ * String interning (or something like `smartstring`) would be better for symbols, but would pull
+ * in a dependency.
+ */
+
+use std::str;
+use std::io::Read;
+use std::fs::File;
+use std::path::Path;
+
+//////////// Types and Constants
+
+const SEXPR_BUILTINS: [&'static str; 14] = [
+    "=", ">", ">=", "<", "<=",
+    "+", "-", "*", "/",
+    "exp", "log", "sin", "cos", "tan",
+    ];
+
+#[derive(Clone, PartialEq)]
+pub enum SExpr {
+    SNull,
+    STrue,
+    SFalse,
+    SInteger(i64),
+    SFloat(f64),
+    SBuiltin(String),
+    SSymbol(String),
+    SIdentifier(String),
+    SString(String),
+    SList(Vec<SExpr>),
+    SVector(Vec<SExpr>),
+    //SQuote(Vec<SExpr>),
+}
+
+//////////// Lexing, Parsing, and Printing
+
+fn is_whitespace(c: char) -> bool {
+    " \t\r\n".find(c) != None
+}
+
+fn is_separator(c: char) -> bool {
+    "()#".find(c) != None
+}
+
+fn is_valid_identifier(s: &str) -> bool {
+    // Valid Scheme (R7RS-small) identifier characters: ! $ % & * + - . / : < = > ? @ ^ _ ~
+    // Note: this is not complete or compliant
+    if s.len() == 0 {
+        return false;
+    }
+    if s == "." {
+        return false;
+    }
+    if s.starts_with("-") || s.starts_with("'") {
+        return false;
+    }
+    for (i, c) in s.chars().enumerate() {
+        if !( c.is_alphabetic() || "!$%&*+-./:<=>?@^_~".find(c) != None || (c.is_numeric() && i > 0) ) {
+            return false;
+        }
+    }
+    return true;
+}
+
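+// Illustrative sketch of how the helpers above behave on a few sample inputs;
+// the cases here are examples only, not an exhaustive specification.
+#[cfg(test)]
+#[test]
+fn test_char_and_identifier_helpers() {
+    assert!(is_whitespace(' '));
+    assert!(is_separator('('));
+    assert!(!is_separator('a'));
+    assert!(is_valid_identifier("set!"));
+    assert!(is_valid_identifier("x1"));
+    assert!(!is_valid_identifier("."));
+    assert!(!is_valid_identifier("-foo"));
+}
+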
+/*
+ * This function takes a raw string and splits it up into a flat sequence of string tokens.
+ * It should handle basic quotes (double quotes only) and comments (';' character to end-of-line).
+ * Note: char indices are used as byte offsets below, so this assumes single-byte (ASCII) input.
+ */
+pub fn sexpr_tokenize<'a>(raw_str: &'a str) -> Result<Vec<&'a str>, String> {
+    let mut ret = Vec::<&str>::new();
+    let mut food: usize = 0; // "how many chars of current token have we read?"
+    let mut quoted: bool = false;
+    let mut commented: bool = false;
+    for (i, c) in raw_str.chars().enumerate() {
+        if quoted {
+            // Safe to look back a character here because quoted can't be true for the first char
+            if c == '"' && raw_str.chars().collect::<Vec<char>>()[i-1] != '\\' {
+                ret.push(&raw_str[i-food-1..i+1]);
+                quoted = false;
+                food = 0;
+            } else if raw_str.len() == i+1 {
+                return Err(format!("unmatched quote char"));
+            } else {
+                food += 1;
+            }
+        } else if commented {
+            if c == '\n' {
+                commented = false;
+            }
+        } else if c == ';' {
+            if food > 0 {
+                ret.push(&raw_str[i-food..i]);
+            }
+            commented = true;
+            food = 0;
+        } else if c == '"' {
+            if food > 0 {
+                return Err(format!("unexpected quote char"));
+            }
+            quoted = true;
+        } else if is_whitespace(c) || is_separator(c) {
+            if food > 0 {
+                ret.push(&raw_str[i-food..i]);
+            }
+            if is_separator(c) {
+                ret.push(&raw_str[i..i+1]);
+            }
+            food = 0;
+        } else if raw_str.len() == i+1 {
+            // end of input
+            ret.push(&raw_str[i-food..]);
+        } else {
+            food += 1;
+        }
+    }
+    if quoted {
+        return Err(format!("unmatched (trailing) quote char"));
+    }
+    return Ok(ret);
+}
+
+/*
+ * This function takes a token (still a string) and parses it into a single SExpression.
+ */
+fn sexpr_parse_token(token: &str) -> Result<SExpr, String> {
+
+    // Is it a constant?
+    match token {
+        "#t" => return Ok(SExpr::STrue),
+        "#f" => return Ok(SExpr::SFalse),
+        _ => ()
+    }
+
+    // Is it a builtin?
+    if SEXPR_BUILTINS.contains(&token) {
+        return Ok(SExpr::SBuiltin(token.to_string()));
+    }
+
+    // Try to parse as an integer
+    match token.parse::<i64>() {
+        Ok(x) => return Ok(SExpr::SInteger(x)),
+        Err(_) => ()
+    }
+
+    // Try to parse as a float
+    match token.parse::<f64>() {
+        Ok(x) => return Ok(SExpr::SFloat(x)),
+        Err(_) => ()
+    }
+
+    // Is it a string?
+    if token.starts_with("\"") && token.ends_with("\"") {
+        return Ok(SExpr::SString(token.to_string()));
+    }
+
+    // Is it a quoted literal?
+    if token.starts_with("'") && token.len() > 1 {
+        match sexpr_parse_token(&token[1..]) {
+            Ok(SExpr::SIdentifier(t)) => return Ok(SExpr::SSymbol(t)),
+            Ok(e) => return Ok(e),
+            _ => {},
+        }
+    }
+
+    // Else, we'll treat it as an identifier
+    if is_valid_identifier(token) {
+        return Ok(SExpr::SIdentifier(token.to_string()));
+    }
+
+    return Err(format!("unparsable token: \"{}\"", token));
+}
+
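+// Illustrative sketch of single-token parsing. Note that SString keeps its
+// surrounding double quotes, and SExpr derives PartialEq but not Debug, so
+// plain assert! comparisons are used here (assert_eq! would require Debug).
+#[cfg(test)]
+#[test]
+fn test_parse_single_tokens() {
+    assert!(sexpr_parse_token("#t").unwrap() == SExpr::STrue);
+    assert!(sexpr_parse_token("+").unwrap() == SExpr::SBuiltin("+".to_string()));
+    assert!(sexpr_parse_token("42").unwrap() == SExpr::SInteger(42));
+    assert!(sexpr_parse_token("3.5").unwrap() == SExpr::SFloat(3.5));
+    assert!(sexpr_parse_token("\"hi\"").unwrap() == SExpr::SString("\"hi\"".to_string()));
+    assert!(sexpr_parse_token("'foo").unwrap() == SExpr::SSymbol("foo".to_string()));
+    assert!(sexpr_parse_token("foo").unwrap() == SExpr::SIdentifier("foo".to_string()));
+}
+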
+/*
+ * This function takes a flat sequence of string tokens (as output by sexpr_tokenize) and parses
+ * it into an SExpression (e.g., a nested list of expressions).
+ */
+pub fn sexpr_parse(tokens: &Vec<&str>, depth: u32) -> Result<(Vec<SExpr>, usize), String> {
+    let mut i: usize = 0;
+    if tokens.len() == 0 {
+        return Ok((vec![SExpr::SNull], 0));
+    } else if tokens.len() == 1 {
+        let expr = sexpr_parse_token(tokens[0])?;
+        return Ok((vec![expr], 1));
+    }
+    let mut parsed: usize = 0;
+    let mut ret = Vec::<SExpr>::new();
+    while i < tokens.len() {
+        parsed += 1;
+        match tokens[i] {
+            "(" | "#(" => {
+                // "Read ahead" to check for an empty list or vector
+                if i+1 < tokens.len() && tokens[i+1] == ")" {
+                    ret.push(SExpr::SNull);
+                    i += 1;
+                    parsed += 1;
+                } else {
+                    let (expr_list, skip) = sexpr_parse(&tokens[i+1..].to_vec(), depth+1)?;
+                    i += skip;
+                    parsed += skip;
+                    match tokens[i-skip] {
+                        "(" => ret.push(SExpr::SList(expr_list)),
+                        "#(" => ret.push(SExpr::SVector(expr_list)),
+                        _ => unreachable!(),
+                    };
+                }
+            },
+            ")" => {
+                if depth == 0 {
+                    return Err(format!("missing an open bracket"));
+                }
+                return Ok((ret, parsed));
+            },
+            "'" => {
+                // quote syntax is not handled yet (see the commented-out SQuote variant above)
+            },
+            token => {
+                let expr = sexpr_parse_token(token)?;
+                ret.push(expr);
+            }
+        }
+        i += 1;
+    }
+    if depth > 0 {
+        return Err(format!("missing a close bracket"));
+    }
+    return Ok((ret, parsed));
+}
+
+/*
+ * This function takes an arbitrary SExpression and returns a string representation.
+ * It's basically the inverse of sexpr_tokenize and sexpr_parse; the output representation is
+ * just plain old LISP/Scheme s-expr syntax.
+ */
+pub fn sexpr_repr(ast: &SExpr) -> Result<String, String> {
+    return match ast {
+        &SExpr::STrue => Ok("#t".to_string()),
+        &SExpr::SFalse => Ok("#f".to_string()),
+        &SExpr::SNull => Ok("'()".to_string()), // TODO: just () ?
+        &SExpr::SInteger(num) => Ok(format!("{}", num).to_string()),
+        &SExpr::SFloat(num) => Ok(format!("{}", num).to_string()),
+        &SExpr::SBuiltin(ref b) => Ok(b.clone()),
+        &SExpr::SString(ref s) => Ok(s.clone()),
+        &SExpr::SSymbol(ref s) => Ok(s.clone()),
+        &SExpr::SIdentifier(ref s) => Ok(s.to_string()),
+        &SExpr::SList(ref list) => {
+            let elements: Vec<String> = list.iter().map(|ref el| sexpr_repr(&el).unwrap()).collect();
+            Ok(format!("({})", elements.join(" ")))
+        },
+        &SExpr::SVector(ref list) => {
+            let elements: Vec<String> = list.iter().map(|ref el| sexpr_repr(&el).unwrap()).collect();
+            Ok(format!("#({})", elements.join(" ")))
+        },
+        /*
+        &SExpr::SQuote(ref list) => {
+            let mut ret: String =
+                list.iter().fold("(quote ".to_string(),
+                                 |acc, ref el| acc + " " + &sexpr_repr(&el).unwrap());
+            ret.push_str(" )");
+            Ok(ret)
+        },
+        */
+    }
+}
+
+pub fn sexpr_parse_file(fpath: &Path) -> Result<(), String> {
+
+    let mut raw_bytes: Vec<u8> = Vec::new();
+
+    let mut f = File::open(fpath)
+        .expect(&format!("couldn't open file: {}", &fpath.to_str().unwrap()));
+    f.read_to_end(&mut raw_bytes)
+        .expect(&format!("couldn't read file: {}", &fpath.to_str().unwrap()));
+    let contents = String::from_utf8(raw_bytes)
+        .expect(&format!("UTF-8 decode error reading file: {}", &fpath.to_str().unwrap()));
+
+    let tokens = sexpr_tokenize(&contents)?;
+    let (_ast_list, _) = sexpr_parse(&tokens, 0)?;
+    Ok(())
+}
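+
+// End-to-end illustrative sketch: tokenize, parse, and print a small expression
+// back out. The expected strings here follow sexpr_tokenize and sexpr_repr above.
+#[cfg(test)]
+#[test]
+fn test_tokenize_parse_repr_round_trip() {
+    let tokens = sexpr_tokenize("(+ 1 (* 2 3))").unwrap();
+    assert!(tokens == vec!["(", "+", "1", "(", "*", "2", "3", ")", ")"]);
+    let (exprs, _count) = sexpr_parse(&tokens, 0).unwrap();
+    assert!(sexpr_repr(&exprs[0]).unwrap() == "(+ 1 (* 2 3))");
+}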