initial copy of hand-written sexpr parsing

author: Bryan Newbold <bnewbold@robocracy.org> 2021-10-16 20:08:42 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-10-16 20:08:42 -0700
commit: d797df5488bf7ed84b6a53cc503bec15262a31bf (patch)
tree: c2bf42c784d4dc2e080576e6d28fe214f80c2e52
parent: fcc026dec1b6193c187a917e310b52c47b918029 (diff)
download: casual-d797df5488bf7ed84b6a53cc503bec15262a31bf.tar.gz
casual-d797df5488bf7ed84b6a53cc503bec15262a31bf.zip
5 files changed, 419 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 81a4762..6543366 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ build/
 _build/
 src/build/
 *.log
+target/
 
 # Don't ignore this file itself
 !.gitignore
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..33515ef
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "casual"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..8d86d4e
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "casual"
+version = "0.1.0"
+edition = "2018"
+authors = ["Bryan Newbold <bnewbold@robocracy.org>"]
+license = "MIT"
+readme = "README.md"
+keywords = ["computer-algebra"]
+
+[dependencies]
+
+[dev-dependencies]
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..6b2bc57
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,115 @@
+
+use std::io;
+use std::env;
+use std::io::Write;
+use std::path::Path;
+
+mod sexpr;
+
+fn repl(verbose: bool) {
+
+    let stdin = io::stdin();
+    let mut stdout = io::stdout();
+
+    loop {
+        let raw_input = &mut String::new();
+        stdout.write(b"\ncasual> ").unwrap();
+        stdout.flush().unwrap();
+        stdin.read_line(raw_input).unwrap();
+        let raw_input = raw_input; // mutable to immutable reference
+        if raw_input.len() == 0 {
+            // end-of-line, aka Ctrl-D. Blank line will still have newline char
+            stdout.write(b"\nCiao!\n").unwrap();
+            return;
+        }
+        // TODO: use debug or something instead of "verbose"?
+        let tokens = match sexpr::sexpr_tokenize(&raw_input) {
+            Ok(tokens) => {
+                if verbose { println!("Tokens: {}", tokens.join(", ")); };
+                tokens
+            },
+            Err(e) => {
+                println!("couldn't tokenize: {}", e);
+                continue;
+            }
+        };
+        let ast = match sexpr::sexpr_parse(&tokens, 0) {
+            Ok((mut ast_list, _)) => {
+                if verbose {
+                    for ast in &ast_list {
+                        println!("AST: {}", sexpr::sexpr_repr(ast).unwrap());
+                    };
+                };
+                // We're a REPL, so only one expression at a time
+                if ast_list.len() > 1 {
+                    println!("one expression at a time on the REPL, please!");
+                    continue;
+                } else if ast_list.len() == 0 {
+                    sexpr::SExpr::SNull
+                } else {
+                    let ast = ast_list.pop().unwrap();
+                    ast
+                }
+            },
+            Err(e) => {
+                println!("couldn't parse: {}", e);
+                continue;
+            }
+        };
+        println!("{}", sexpr::sexpr_repr(&ast).unwrap());
+    }
+}
+
+fn usage() {
+    println!("usage:\tcasual [-h] [-v] [--no-repl] [<files>]");
+    println!("");
+    println!("Files will be loaded in order, then drop to REPL (unless \"--no-repl\" is passed).");
+    println!("Verbose flag (\"-v\") will result in lexed tokens and parsed AST \
+        being dumped to stdout (when on REPL).");
+}
+
+fn main() {
+
+    let mut verbose: bool = false;
+    let mut no_repl: bool = false;
+
+    let mut file_list = Vec::<String>::new();
+
+    for arg in env::args().skip(1) {
+        match &*arg {
+            "-v" | "--verbose"  => { verbose = true; },
+            "--no-repl"         => { no_repl = true; },
+            "-h" | "--help"     => { usage(); return; },
+            _ if arg.starts_with("-") => {
+                println!("Unknown option: {}", arg);
+                println!("");
+                usage();
+                return;
+            },
+            _ => {
+                file_list.push(arg.clone());
+            }
+        }
+    }
+
+    for fname in file_list {
+        let fpath = Path::new(&fname);
+        if !fpath.is_file() {
+            println!("File not found (or not file): {}", fname);
+            return;
+        }
+        println!("Loading {}...", fname);
+        match sexpr::sexpr_parse_file(&fpath) {
+            Err(e) => {
+                println!("Error loading file: {}\n    {}", fname, e);
+                return;
+            },
+            Ok(_) => ()
+        }
+    }
+
+    if !no_repl {
+        repl(verbose);
+    }
+}
+
diff --git a/src/sexpr.rs b/src/sexpr.rs
new file mode 100644
index 0000000..bc13bd4
--- /dev/null
+++ b/src/sexpr.rs
@@ -0,0 +1,284 @@
+/*
+ * Simple hand-rolled "S-Expression" parser with no dependencies.
+ *
+ * Copied and tweaked from my 'spectrum' rust scheme implementation.
+ *
+ * Not performant: allocates all over the place, takes a full string to parse as input, etc.
+ *
+ * string interning (or something like `smartstring`) would be better for symbols, but would pull
+ * in a dependency.
+ */
+
+use std::str;
+use std::io::Read;
+use std::fs::File;
+use std::path::Path;
+
+//////////// Types and Constants
+
+const SEXPR_BUILTINS: [&'static str; 14] = [
+    "=", ">", ">=", "<", "<=",
+    "+", "-", "*", "/",
+    "exp", "log", "sin", "cos", "tan",
+    ];
+
+#[derive(Clone, PartialEq)]
+pub enum SExpr {
+    SNull,
+    STrue,
+    SFalse,
+    SInteger(i64),
+    SFloat(f64),
+    SBuiltin(String),
+    SSymbol(String),
+    SIdentifier(String),
+    SString(String),
+    SList(Vec<SExpr>),
+    SVector(Vec<SExpr>),
+    //SQuote(Vec<SExpr>),
+}
+
+//////////// Lexing, Parsing, and Printing
+
+fn is_whitespace(c: char) -> bool{
+    " \t\r\n".find(c) != None
+}
+
+fn is_seperator(c: char) -> bool {
+    "()#".find(c) != None
+}
+
+fn is_valid_identifier(s: &str) -> bool {
+    // Valid Scheme (R7RS-small) identifier characters: ! $ % & * + - . / : < = > ? @ ^ _ ~
+    // Note: this is not complete or compliant
+    if s.len() == 0 {
+        return false;
+    }
+    if s == "." {
+        return false;
+    }
+    if s.starts_with("-") || s.starts_with("'") {
+        return false;
+    }
+    for (i, c) in s.chars().enumerate() {
+        if !( c.is_alphabetic() || "!$%&*+-./:<=>?@^_~".find(c) != None || (c.is_numeric() && i > 0) ) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/*
+ * This function takes a raw string and splits it up into a flat sequence of string tokens.
+ * It should handle basic quotes (double quotes only) and comments (';' character to end-of-line).
+ */
+pub fn sexpr_tokenize<'a>(raw_str: &'a str) -> Result<Vec<&'a str>, String> {
+    let mut ret = Vec::<&str>::new();
+    let mut food: usize = 0;    // "how many chars of current token have we read?"
+    let mut quoted: bool = false;
+    let mut commented: bool = false;
+    for (i, c) in raw_str.chars().enumerate() {
+        if quoted {
+            // Safe to look-back a character here because quoted can't be true for first char
+            if c == '"' && raw_str.chars().collect::<Vec<char>>()[i-1] != '\\' {
+                ret.push(&raw_str[i-food-1..i+1]);
+                quoted = false;
+                food = 0;
+            } else if raw_str.len() == i+1 {
+                return Err(format!("unmatched quote char"));
+            } else {
+                food += 1;
+            }
+        } else if commented {
+            if c == '\n' {
+                commented = false;
+            }
+        } else if c == ';' {
+            if food > 0 {
+                ret.push(&raw_str[i-food..i]);
+            }
+            commented = true;
+            food = 0;
+        } else if c == '"' {
+            if food > 0 {
+                return Err(format!("unexpected quote char"));
+            }
+            quoted = true;
+        } else if is_whitespace(c) || is_seperator(c) {
+            if food > 0 {
+                ret.push(&raw_str[i-food..i]);
+            }
+            if is_seperator(c) {
+                ret.push(&raw_str[i..i+1]);
+            }
+            food = 0;
+        } else if raw_str.len() == i+1 {
+            // end of input
+            ret.push(&raw_str[i-food..]);
+        } else {
+            food += 1;
+        }
+    }
+    if quoted {
+        return Err(format!("unmatched (trailing) quote char"));
+    }
+    return Ok(ret);
+}
+
+/*
+ * This function takes a token (still a string) and parses it into a single SExpression
+ */
+fn sexpr_parse_token(token: &str) -> Result<SExpr, String> {
+
+    // Is it a constant?
+    match token {
+        "#t" => return Ok(SExpr::STrue),
+        "#f" => return Ok(SExpr::SFalse),
+        _ => ()
+    }
+
+    // Is it a builtin?
+    if SEXPR_BUILTINS.contains(&token) {
+        return Ok(SExpr::SBuiltin(token.to_string()));
+    }
+
+    // Try to parse as an integer
+    match token.parse::<i64>() {
+        Ok(x) => return Ok(SExpr::SInteger(x)),
+        Err(_) => ()
+    }
+
+    // Try to parse as a number
+    match token.parse::<f64>() {
+        Ok(x) => return Ok(SExpr::SFloat(x)),
+        Err(_) => ()
+    }
+
+    // Is it a string?
+    if token.starts_with("\"") && token.ends_with("\"") {
+        return Ok(SExpr::SString(token.to_string()));
+    }
+
+    // Is it a quoted literal?
+    if token.starts_with("'") && token.len() > 1 {
+        match sexpr_parse_token(&token[1..]) {
+            Ok(SExpr::SIdentifier(t)) => return Ok(SExpr::SSymbol(t)),
+            Ok(e) => return Ok(e),
+            _ => {},
+        }
+    }
+
+    // Else, we'll treat it as an identifier
+    if is_valid_identifier(token) {
+        return Ok(SExpr::SIdentifier(token.to_string()));
+    }
+
+    return Err(format!("unparsable token: \"{}\"", token));
+}
+
+/*
+ * This function takes a flat sequence of string tokens (as output by sexpr_tokenize) and parses
+ * into a SExpression (eg, a nested list of expressions).
+ */
+pub fn sexpr_parse(tokens: &Vec<&str>, depth: u32) -> Result<(Vec<SExpr>, usize), String> {
+    let mut i: usize = 0;
+    if tokens.len() == 0  {
+        return Ok((vec![SExpr::SNull], 0));
+    } else if tokens.len() == 1 {
+        let expr = sexpr_parse_token(tokens[0])?;
+        return Ok((vec![expr], 1));
+    }
+    let mut parsed: usize = 0;
+    let mut ret = Vec::<SExpr>::new();
+    while i < tokens.len() {
+        parsed += 1;
+        match tokens[i] {
+            "(" | "#(" => {
+                // "Read ahead" to check for empty tuple
+                if i+1 < tokens.len() && tokens[i+1] == ")" {
+                    ret.push(SExpr::SNull);
+                    i += 1;
+                    parsed += 1;
+                } else {
+                    let (expr_list, skip) = sexpr_parse(&tokens[i+1..].to_vec(), depth+1)?;
+                    i += skip;
+                    parsed += skip;
+                    match tokens[i-skip] {
+                        "(" => ret.push(SExpr::SList(expr_list)),
+                        "#(" => ret.push(SExpr::SVector(expr_list)),
+                        _ => unreachable!(),
+                    };
+                }
+            },
+            ")" => {
+                if depth == 0 {
+                    return Err(format!("missing an open bracket"));
+                }
+                return Ok((ret, parsed));
+            },
+            "'" => {
+            },
+            token => {
+                let expr = sexpr_parse_token(token)?;
+                ret.push(expr);
+            }
+        }
+        i += 1;
+    }
+    if depth > 0 {
+        return Err(format!("missing a close bracket"));
+    }
+    return Ok((ret, parsed));
+}
+
+/*
+ * This function takes an arbitary SExpression and returns a string representation.
+ * It's basically the inverse of sexpr_tokenize and sexpr_parse; the output representation is
+ * just plain old LISP/Scheme s-expr syntax.
+ */
+pub fn sexpr_repr(ast: &SExpr) -> Result<String, String> {
+    return match ast {
+        &SExpr::STrue => Ok("#t".to_string()),
+        &SExpr::SFalse => Ok("#f".to_string()),
+        &SExpr::SNull => Ok("'()".to_string()), // TODO: just () ?
+        &SExpr::SInteger(num) => Ok(format!("{}", num).to_string()),
+        &SExpr::SFloat(num) => Ok(format!("{}", num).to_string()),
+        &SExpr::SBuiltin(ref b)=> Ok(b.clone()),
+        &SExpr::SString(ref s)=> Ok(s.clone()),
+        &SExpr::SSymbol(ref s)=> Ok(s.clone()),
+        &SExpr::SIdentifier(ref s)=> Ok(s.to_string()),
+        &SExpr::SList(ref list) => {
+            let elements: Vec<String> = list.iter().map(|ref el| sexpr_repr(&el).unwrap()).collect();
+            Ok(format!("({})", elements.join(" ")))
+        },
+        &SExpr::SVector(ref list) => {
+            let elements: Vec<String> = list.iter().map(|ref el| sexpr_repr(&el).unwrap()).collect();
+            Ok(format!("#({})", elements.join(" ")))
+        },
+        /*
+        &SExpr::SQuote(ref list) => {
+            let mut ret: String =
+                list.iter().fold("(quote ".to_string(),
+                                 |acc, ref el| acc + " " + &sexpr_repr(&el).unwrap());
+            ret.push_str(" )");
+            Ok(ret)
+        },
+        */
+    }
+}
+
+pub fn sexpr_parse_file(fpath: &Path) -> Result<(), String> {
+
+    let mut raw_bytes: Vec<u8> = Vec::new();
+
+    let mut f = File::open(fpath)
+        .expect(&format!("couldn't open file: {}", &fpath.to_str().unwrap()));
+    f.read_to_end(&mut raw_bytes)
+        .expect(&format!("couldn't read file: {}", &fpath.to_str().unwrap()));
+    let contents = String::from_utf8(raw_bytes)
+        .expect(&format!("UTF-8 decode error reading file: {}", &fpath.to_str().unwrap()));
+
+    let tokens = sexpr_tokenize(&contents)?;
+    let (_ast_list, _) = sexpr_parse(&tokens, 0)?;
+    Ok(())
+}
author	Bryan Newbold <bnewbold@robocracy.org>	2021-10-16 20:08:42 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-10-16 20:08:42 -0700
commit	d797df5488bf7ed84b6a53cc503bec15262a31bf (patch)
tree	c2bf42c784d4dc2e080576e6d28fe214f80c2e52
parent	fcc026dec1b6193c187a917e310b52c47b918029 (diff)
download	casual-d797df5488bf7ed84b6a53cc503bec15262a31bf.tar.gz casual-d797df5488bf7ed84b6a53cc503bec15262a31bf.zip