/* * Simple hand-rolled "S-Expression" parser with no dependencies. * * Copied and tweaked from my 'spectrum' rust scheme implementation. * * Not performant: allocates all over the place, takes a full string to parse as input, etc. * * string interning (or something like `smartstring`) would be better for symbols, but would pull * in a dependency. */ use std::str; use std::fmt; use std::io::Read; use std::fs::File; use std::path::Path; //////////// Types and Constants #[derive(Clone, PartialEq, Debug)] pub enum SExpr { SNull, SBoolean(bool), SInteger(i64), SFloat(f64), SString(String), SIdentifier(String), SList(Vec), } //////////// Lexing, Parsing, and Printing fn is_whitespace(c: char) -> bool{ " \t\r\n".find(c) != None } fn is_seperator(c: char) -> bool { "()".find(c) != None } fn is_valid_identifier(s: &str) -> bool { // Valid Scheme (R7RS-small) identifier characters: ! $ % & * + - . / : < = > ? @ ^ _ ~ // Note: this is not complete or compliant if s.len() == 0 { return false; } if s == "." { return false; } if s.starts_with("-") || s.starts_with("'") { return false; } for (i, c) in s.chars().enumerate() { if !( c.is_alphabetic() || "!$%&*+-./:<=>?@^_~".find(c) != None || (c.is_numeric() && i > 0) ) { return false; } } return true; } /* * This function takes a raw string and splits it up into a flat sequence of string tokens. * It should handle basic quotes (double quotes only) and comments (';' character to end-of-line). */ pub fn sexpr_tokenize<'a>(raw_str: &'a str) -> Result, String> { let mut ret = Vec::<&str>::new(); let mut food: usize = 0; // "how many chars of current token have we read?" let mut quoted: bool = false; let mut commented: bool = false; for (i, c) in raw_str.chars().enumerate() { if quoted { // Safe to look-back a character here because quoted can't be true for first char if c == '"' && raw_str.chars().collect::>()[i-1] != '\\' { ret.push(&raw_str[i-food-1..i+1]); quoted = false; food = 0; } else if raw_str.len() == i+1 { return Err(format!("unmatched quote char")); } else { food += 1; } } else if commented { if c == '\n' { commented = false; } } else if c == ';' { if food > 0 { ret.push(&raw_str[i-food..i]); } commented = true; food = 0; } else if c == '"' { if food > 0 { return Err(format!("unexpected quote char")); } quoted = true; } else if is_whitespace(c) || is_seperator(c) { if food > 0 { ret.push(&raw_str[i-food..i]); } if is_seperator(c) { ret.push(&raw_str[i..i+1]); } food = 0; } else if raw_str.len() == i+1 { // end of input ret.push(&raw_str[i-food..]); } else { food += 1; } } if quoted { return Err(format!("unmatched (trailing) quote char")); } return Ok(ret); } /* * This function takes a token (still a string) and parses it into a single SExpression */ fn sexpr_parse_token(token: &str) -> Result { // Is it a constant? match token { "#t" => return Ok(SExpr::SBoolean(true)), "#f" => return Ok(SExpr::SBoolean(false)), _ => () } // Try to parse as an integer match token.parse::() { Ok(x) => return Ok(SExpr::SInteger(x)), Err(_) => () } // Try to parse as floating-point number match token.parse::() { Ok(x) => return Ok(SExpr::SFloat(x)), Err(_) => () } // Is it a string? if token.starts_with("\"") && token.ends_with("\"") { return Ok(SExpr::SString(token.to_string())); } // Else, we'll treat it as an identifier if is_valid_identifier(token) { return Ok(SExpr::SIdentifier(token.to_string())); } return Err(format!("unparsable token: \"{}\"", token)); } /* * This function takes a flat sequence of string tokens (as output by sexpr_tokenize) and parses * into a SExpression (eg, a nested list of expressions). */ pub fn sexpr_parse(tokens: &Vec<&str>, depth: u32) -> Result<(Vec, usize), String> { let mut i: usize = 0; if tokens.len() == 0 { return Ok((vec![SExpr::SNull], 0)); } else if tokens.len() == 1 { let expr = sexpr_parse_token(tokens[0])?; return Ok((vec![expr], 1)); } let mut parsed: usize = 0; let mut ret = Vec::::new(); while i < tokens.len() { parsed += 1; match tokens[i] { "(" => { // "Read ahead" to check for empty tuple if i+1 < tokens.len() && tokens[i+1] == ")" { ret.push(SExpr::SNull); i += 1; parsed += 1; } else { let (expr_list, skip) = sexpr_parse(&tokens[i+1..].to_vec(), depth+1)?; i += skip; parsed += skip; ret.push(SExpr::SList(expr_list)); } }, ")" => { if depth == 0 { return Err(format!("missing an open bracket")); } return Ok((ret, parsed)); }, "'" => { }, token => { let expr = sexpr_parse_token(token)?; ret.push(expr); } } i += 1; } if depth > 0 { return Err(format!("missing a close bracket")); } return Ok((ret, parsed)); } /* * This function takes an arbitary SExpression and returns a string representation. * It's basically the inverse of sexpr_tokenize and sexpr_parse; the output representation is * just plain old LISP/Scheme s-expr syntax. */ pub fn sexpr_repr(ast: &SExpr) -> Result { return match ast { &SExpr::SNull => Ok("()".to_string()), &SExpr::SBoolean(true) => Ok("#t".to_string()), &SExpr::SBoolean(false) => Ok("#f".to_string()), &SExpr::SInteger(num) => Ok(format!("{}", num).to_string()), &SExpr::SFloat(num) => Ok(format!("{}", num).to_string()), &SExpr::SString(ref s)=> Ok(s.clone()), &SExpr::SIdentifier(ref s)=> Ok(s.to_string()), &SExpr::SList(ref list) => { let elements: Vec = list.iter().map(|ref el| sexpr_repr(&el).unwrap()).collect(); Ok(format!("({})", elements.join(" "))) }, } } pub fn sexpr_parse_file(fpath: &Path) -> Result<(), String> { let mut raw_bytes: Vec = Vec::new(); let mut f = File::open(fpath) .expect(&format!("couldn't open file: {}", &fpath.to_str().unwrap())); f.read_to_end(&mut raw_bytes) .expect(&format!("couldn't read file: {}", &fpath.to_str().unwrap())); let contents = String::from_utf8(raw_bytes) .expect(&format!("UTF-8 decode error reading file: {}", &fpath.to_str().unwrap())); let tokens = sexpr_tokenize(&contents)?; let (_ast_list, _) = sexpr_parse(&tokens, 0)?; Ok(()) } impl SExpr { pub fn from_str(raw: &str) -> Result { let tokens = sexpr_tokenize(&raw)?; let (mut ast, _) = sexpr_parse(&tokens, 0)?; Ok(ast.remove(0)) } } impl fmt::Display for SExpr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match sexpr_repr(self) { Ok(repr) => write!(f, "{}", repr), Err(_) => Err(std::fmt::Error), } } }