From d797df5488bf7ed84b6a53cc503bec15262a31bf Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 16 Oct 2021 20:08:42 -0700 Subject: initial copy of hand-written sexpr parsing --- .gitignore | 1 + Cargo.lock | 7 ++ Cargo.toml | 12 +++ src/main.rs | 115 ++++++++++++++++++++++++ src/sexpr.rs | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 419 insertions(+) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/main.rs create mode 100644 src/sexpr.rs diff --git a/.gitignore b/.gitignore index 81a4762..6543366 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ build/ _build/ src/build/ *.log +target/ # Don't ignore this file itself !.gitignore diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..33515ef --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "casual" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..8d86d4e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "casual" +version = "0.1.0" +edition = "2018" +authors = ["Bryan Newbold "] +license = "MIT" +readme = "README.md" +keywords = ["computer-algebra"] + +[dependencies] + +[dev-dependencies] diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..6b2bc57 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,115 @@ + +use std::io; +use std::env; +use std::io::Write; +use std::path::Path; + +mod sexpr; + +fn repl(verbose: bool) { + + let stdin = io::stdin(); + let mut stdout = io::stdout(); + + loop { + let raw_input = &mut String::new(); + stdout.write(b"\ncasual> ").unwrap(); + stdout.flush().unwrap(); + stdin.read_line(raw_input).unwrap(); + let raw_input = raw_input; // mutable to immutable reference + if raw_input.len() == 0 { + // end-of-line, aka Ctrl-D. Blank line will still have newline char + stdout.write(b"\nCiao!\n").unwrap(); + return; + } + // TODO: use debug or something instead of "verbose"? + let tokens = match sexpr::sexpr_tokenize(&raw_input) { + Ok(tokens) => { + if verbose { println!("Tokens: {}", tokens.join(", ")); }; + tokens + }, + Err(e) => { + println!("couldn't tokenize: {}", e); + continue; + } + }; + let ast = match sexpr::sexpr_parse(&tokens, 0) { + Ok((mut ast_list, _)) => { + if verbose { + for ast in &ast_list { + println!("AST: {}", sexpr::sexpr_repr(ast).unwrap()); + }; + }; + // We're a REPL, so only one expression at a time + if ast_list.len() > 1 { + println!("one expression at a time on the REPL, please!"); + continue; + } else if ast_list.len() == 0 { + sexpr::SExpr::SNull + } else { + let ast = ast_list.pop().unwrap(); + ast + } + }, + Err(e) => { + println!("couldn't parse: {}", e); + continue; + } + }; + println!("{}", sexpr::sexpr_repr(&ast).unwrap()); + } +} + +fn usage() { + println!("usage:\tcasual [-h] [-v] [--no-repl] []"); + println!(""); + println!("Files will be loaded in order, then drop to REPL (unless \"--no-repl\" is passed)."); + println!("Verbose flag (\"-v\") will result in lexed tokens and parsed AST \ + being dumped to stdout (when on REPL)."); +} + +fn main() { + + let mut verbose: bool = false; + let mut no_repl: bool = false; + + let mut file_list = Vec::::new(); + + for arg in env::args().skip(1) { + match &*arg { + "-v" | "--verbose" => { verbose = true; }, + "--no-repl" => { no_repl = true; }, + "-h" | "--help" => { usage(); return; }, + _ if arg.starts_with("-") => { + println!("Unknown option: {}", arg); + println!(""); + usage(); + return; + }, + _ => { + file_list.push(arg.clone()); + } + } + } + + for fname in file_list { + let fpath = Path::new(&fname); + if !fpath.is_file() { + println!("File not found (or not file): {}", fname); + return; + } + println!("Loading {}...", fname); + match sexpr::sexpr_parse_file(&fpath) { + Err(e) => { + println!("Error loading file: {}\n {}", fname, e); + return; + }, + Ok(_) => () + } + } + + if !no_repl { + repl(verbose); + } +} + diff --git a/src/sexpr.rs b/src/sexpr.rs new file mode 100644 index 0000000..bc13bd4 --- /dev/null +++ b/src/sexpr.rs @@ -0,0 +1,284 @@ +/* + * Simple hand-rolled "S-Expression" parser with no dependencies. + * + * Copied and tweaked from my 'spectrum' rust scheme implementation. + * + * Not performant: allocates all over the place, takes a full string to parse as input, etc. + * + * string interning (or something like `smartstring`) would be better for symbols, but would pull + * in a dependency. + */ + +use std::str; +use std::io::Read; +use std::fs::File; +use std::path::Path; + +//////////// Types and Constants + +const SEXPR_BUILTINS: [&'static str; 14] = [ + "=", ">", ">=", "<", "<=", + "+", "-", "*", "/", + "exp", "log", "sin", "cos", "tan", + ]; + +#[derive(Clone, PartialEq)] +pub enum SExpr { + SNull, + STrue, + SFalse, + SInteger(i64), + SFloat(f64), + SBuiltin(String), + SSymbol(String), + SIdentifier(String), + SString(String), + SList(Vec), + SVector(Vec), + //SQuote(Vec), +} + +//////////// Lexing, Parsing, and Printing + +fn is_whitespace(c: char) -> bool{ + " \t\r\n".find(c) != None +} + +fn is_seperator(c: char) -> bool { + "()#".find(c) != None +} + +fn is_valid_identifier(s: &str) -> bool { + // Valid Scheme (R7RS-small) identifier characters: ! $ % & * + - . / : < = > ? @ ^ _ ~ + // Note: this is not complete or compliant + if s.len() == 0 { + return false; + } + if s == "." { + return false; + } + if s.starts_with("-") || s.starts_with("'") { + return false; + } + for (i, c) in s.chars().enumerate() { + if !( c.is_alphabetic() || "!$%&*+-./:<=>?@^_~".find(c) != None || (c.is_numeric() && i > 0) ) { + return false; + } + } + return true; +} + +/* + * This function takes a raw string and splits it up into a flat sequence of string tokens. + * It should handle basic quotes (double quotes only) and comments (';' character to end-of-line). + */ +pub fn sexpr_tokenize<'a>(raw_str: &'a str) -> Result, String> { + let mut ret = Vec::<&str>::new(); + let mut food: usize = 0; // "how many chars of current token have we read?" + let mut quoted: bool = false; + let mut commented: bool = false; + for (i, c) in raw_str.chars().enumerate() { + if quoted { + // Safe to look-back a character here because quoted can't be true for first char + if c == '"' && raw_str.chars().collect::>()[i-1] != '\\' { + ret.push(&raw_str[i-food-1..i+1]); + quoted = false; + food = 0; + } else if raw_str.len() == i+1 { + return Err(format!("unmatched quote char")); + } else { + food += 1; + } + } else if commented { + if c == '\n' { + commented = false; + } + } else if c == ';' { + if food > 0 { + ret.push(&raw_str[i-food..i]); + } + commented = true; + food = 0; + } else if c == '"' { + if food > 0 { + return Err(format!("unexpected quote char")); + } + quoted = true; + } else if is_whitespace(c) || is_seperator(c) { + if food > 0 { + ret.push(&raw_str[i-food..i]); + } + if is_seperator(c) { + ret.push(&raw_str[i..i+1]); + } + food = 0; + } else if raw_str.len() == i+1 { + // end of input + ret.push(&raw_str[i-food..]); + } else { + food += 1; + } + } + if quoted { + return Err(format!("unmatched (trailing) quote char")); + } + return Ok(ret); +} + +/* + * This function takes a token (still a string) and parses it into a single SExpression + */ +fn sexpr_parse_token(token: &str) -> Result { + + // Is it a constant? + match token { + "#t" => return Ok(SExpr::STrue), + "#f" => return Ok(SExpr::SFalse), + _ => () + } + + // Is it a builtin? + if SEXPR_BUILTINS.contains(&token) { + return Ok(SExpr::SBuiltin(token.to_string())); + } + + // Try to parse as an integer + match token.parse::() { + Ok(x) => return Ok(SExpr::SInteger(x)), + Err(_) => () + } + + // Try to parse as a number + match token.parse::() { + Ok(x) => return Ok(SExpr::SFloat(x)), + Err(_) => () + } + + // Is it a string? + if token.starts_with("\"") && token.ends_with("\"") { + return Ok(SExpr::SString(token.to_string())); + } + + // Is it a quoted literal? + if token.starts_with("'") && token.len() > 1 { + match sexpr_parse_token(&token[1..]) { + Ok(SExpr::SIdentifier(t)) => return Ok(SExpr::SSymbol(t)), + Ok(e) => return Ok(e), + _ => {}, + } + } + + // Else, we'll treat it as an identifier + if is_valid_identifier(token) { + return Ok(SExpr::SIdentifier(token.to_string())); + } + + return Err(format!("unparsable token: \"{}\"", token)); +} + +/* + * This function takes a flat sequence of string tokens (as output by sexpr_tokenize) and parses + * into a SExpression (eg, a nested list of expressions). + */ +pub fn sexpr_parse(tokens: &Vec<&str>, depth: u32) -> Result<(Vec, usize), String> { + let mut i: usize = 0; + if tokens.len() == 0 { + return Ok((vec![SExpr::SNull], 0)); + } else if tokens.len() == 1 { + let expr = sexpr_parse_token(tokens[0])?; + return Ok((vec![expr], 1)); + } + let mut parsed: usize = 0; + let mut ret = Vec::::new(); + while i < tokens.len() { + parsed += 1; + match tokens[i] { + "(" | "#(" => { + // "Read ahead" to check for empty tuple + if i+1 < tokens.len() && tokens[i+1] == ")" { + ret.push(SExpr::SNull); + i += 1; + parsed += 1; + } else { + let (expr_list, skip) = sexpr_parse(&tokens[i+1..].to_vec(), depth+1)?; + i += skip; + parsed += skip; + match tokens[i-skip] { + "(" => ret.push(SExpr::SList(expr_list)), + "#(" => ret.push(SExpr::SVector(expr_list)), + _ => unreachable!(), + }; + } + }, + ")" => { + if depth == 0 { + return Err(format!("missing an open bracket")); + } + return Ok((ret, parsed)); + }, + "'" => { + }, + token => { + let expr = sexpr_parse_token(token)?; + ret.push(expr); + } + } + i += 1; + } + if depth > 0 { + return Err(format!("missing a close bracket")); + } + return Ok((ret, parsed)); +} + +/* + * This function takes an arbitary SExpression and returns a string representation. + * It's basically the inverse of sexpr_tokenize and sexpr_parse; the output representation is + * just plain old LISP/Scheme s-expr syntax. + */ +pub fn sexpr_repr(ast: &SExpr) -> Result { + return match ast { + &SExpr::STrue => Ok("#t".to_string()), + &SExpr::SFalse => Ok("#f".to_string()), + &SExpr::SNull => Ok("'()".to_string()), // TODO: just () ? + &SExpr::SInteger(num) => Ok(format!("{}", num).to_string()), + &SExpr::SFloat(num) => Ok(format!("{}", num).to_string()), + &SExpr::SBuiltin(ref b)=> Ok(b.clone()), + &SExpr::SString(ref s)=> Ok(s.clone()), + &SExpr::SSymbol(ref s)=> Ok(s.clone()), + &SExpr::SIdentifier(ref s)=> Ok(s.to_string()), + &SExpr::SList(ref list) => { + let elements: Vec = list.iter().map(|ref el| sexpr_repr(&el).unwrap()).collect(); + Ok(format!("({})", elements.join(" "))) + }, + &SExpr::SVector(ref list) => { + let elements: Vec = list.iter().map(|ref el| sexpr_repr(&el).unwrap()).collect(); + Ok(format!("#({})", elements.join(" "))) + }, + /* + &SExpr::SQuote(ref list) => { + let mut ret: String = + list.iter().fold("(quote ".to_string(), + |acc, ref el| acc + " " + &sexpr_repr(&el).unwrap()); + ret.push_str(" )"); + Ok(ret) + }, + */ + } +} + +pub fn sexpr_parse_file(fpath: &Path) -> Result<(), String> { + + let mut raw_bytes: Vec = Vec::new(); + + let mut f = File::open(fpath) + .expect(&format!("couldn't open file: {}", &fpath.to_str().unwrap())); + f.read_to_end(&mut raw_bytes) + .expect(&format!("couldn't read file: {}", &fpath.to_str().unwrap())); + let contents = String::from_utf8(raw_bytes) + .expect(&format!("UTF-8 decode error reading file: {}", &fpath.to_str().unwrap())); + + let tokens = sexpr_tokenize(&contents)?; + let (_ast_list, _) = sexpr_parse(&tokens, 0)?; + Ok(()) +} -- cgit v1.2.3