From ba7d6a842cb4d61357b588fb2d3ec552c654ae64 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 8 Jan 2019 23:18:32 -0800
Subject: huge refactor of rust modules/files

Taking advantage of new Rust 2018 crate/module path changes, and
re-organizing things. Somewhat optimistic this could help with partial
rebuild speed also.
---
 rust/src/identifiers.rs | 376 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 376 insertions(+)
 create mode 100644 rust/src/identifiers.rs

(limited to 'rust/src/identifiers.rs')
diff --git a/rust/src/identifiers.rs b/rust/src/identifiers.rs
new file mode 100644
index 00000000..adb9f413
--- /dev/null
+++ b/rust/src/identifiers.rs
@@ -0,0 +1,376 @@
+use crate::errors::*;
+use data_encoding::BASE32_NOPAD;
+use regex::Regex;
+use serde_json;
+use std::str::FromStr;
+use uuid::Uuid;
+
+#[derive(Clone, Copy, PartialEq, Debug)]
+pub struct FatCatId(Uuid);
+
+impl ToString for FatCatId {
+    fn to_string(&self) -> String {
+        uuid2fcid(&self.to_uuid())
+    }
+}
+
+impl FromStr for FatCatId {
+    type Err = Error;
+    fn from_str(s: &str) -> Result<FatCatId> {
+        fcid2uuid(s).map(|u| FatCatId(u))
+    }
+}
+
+impl FatCatId {
+    pub fn to_uuid(&self) -> Uuid {
+        self.0
+    }
+    // TODO: just make it u: Uuid and clone (not by ref)
+    pub fn from_uuid(u: &Uuid) -> FatCatId {
+        FatCatId(*u)
+    }
+}
+
+/// Convert fatcat IDs (base32 strings) to UUID
+pub fn fcid2uuid(fcid: &str) -> Result<Uuid> {
+    if fcid.len() != 26 {
+        return Err(ErrorKind::InvalidFatcatId(fcid.to_string()).into());
+    }
+    let mut raw = vec![0; 16];
+    BASE32_NOPAD
+        .decode_mut(fcid.to_uppercase().as_bytes(), &mut raw)
+        .map_err(|_dp| ErrorKind::InvalidFatcatId(fcid.to_string()))?;
+    // unwrap() is safe here, because we know raw is always 16 bytes
+    Ok(Uuid::from_bytes(&raw).unwrap())
+}
+
+/// Convert UUID to fatcat ID string (base32 encoded)
+pub fn uuid2fcid(id: &Uuid) -> String {
+    let raw = id.as_bytes();
+    BASE32_NOPAD.encode(raw).to_lowercase()
+}
+
+pub fn check_username(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^[A-Za-z][A-Za-z0-9._-]{2,24}$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedExternalId(format!(
+            "not a valid username: '{}' (expected, eg, 'AcidBurn')",
+            raw
+        ))
+        .into())
+    }
+}
+
+#[test]
+fn test_check_username() {
+    assert!(check_username("bnewbold").is_ok());
+    assert!(check_username("BNEWBOLD").is_ok());
+    assert!(check_username("admin").is_ok());
+    assert!(check_username("friend-bot").is_ok());
+    assert!(check_username("dog").is_ok());
+    assert!(check_username("g_____").is_ok());
+    assert!(check_username("bnewbold2-archive").is_ok());
+    assert!(check_username("bnewbold2-internetarchive").is_ok());
+
+    assert!(check_username("").is_err());
+    assert!(check_username("_").is_err());
+    assert!(check_username("gg").is_err());
+    assert!(check_username("adminadminadminadminadminadminadmin").is_err());
+    assert!(check_username("bryan newbold").is_err());
+    assert!(check_username("01234567-3456-6780").is_err());
+    assert!(check_username(".admin").is_err());
+    assert!(check_username("-bot").is_err());
+}
+
+pub fn check_pmcid(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^PMC\d+$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedExternalId(format!(
+            "not a valid PubMed Central ID (PMCID): '{}' (expected, eg, 'PMC12345')",
+            raw
+        ))
+        .into())
+    }
+}
+
+pub fn check_pmid(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^\d+$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedExternalId(format!(
+            "not a valid PubMed ID (PMID): '{}' (expected, eg, '1234')",
+            raw
+        ))
+        .into())
+    }
+}
+
+pub fn check_wikidata_qid(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^Q\d+$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedExternalId(format!(
+            "not a valid Wikidata QID: '{}' (expected, eg, 'Q1234')",
+            raw
+        ))
+        .into())
+    }
+}
+
+pub fn check_doi(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^10.\d{3,6}/.+$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedExternalId(format!(
+            "not a valid DOI: '{}' (expected, eg, '10.1234/aksjdfh')",
+            raw
+        ))
+        .into())
+    }
+}
+
+pub fn check_issn(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^\d{4}-\d{3}[0-9X]$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedExternalId(format!(
+            "not a valid ISSN: '{}' (expected, eg, '1234-5678')",
+            raw
+        ))
+        .into())
+    }
+}
+
+pub fn check_orcid(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedExternalId(format!(
+            "not a valid ORCID: '{}' (expected, eg, '0123-4567-3456-6789')",
+            raw
+        ))
+        .into())
+    }
+}
+
+#[test]
+fn test_check_orcid() {
+    assert!(check_orcid("0123-4567-3456-6789").is_ok());
+    assert!(check_orcid("0123-4567-3456-678X").is_ok());
+    assert!(check_orcid("01234567-3456-6780").is_err());
+    assert!(check_orcid("0x23-4567-3456-6780").is_err());
+}
+
+pub fn check_md5(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^[a-f0-9]{32}$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedChecksum(format!(
+            "not a valid MD5: '{}' (expected lower-case hex, eg, '1b39813549077b2347c0f370c3864b40')",
+            raw
+        ))
+        .into())
+    }
+}
+
+#[test]
+fn test_check_md5() {
+    assert!(check_md5("1b39813549077b2347c0f370c3864b40").is_ok());
+    assert!(check_md5("1g39813549077b2347c0f370c3864b40").is_err());
+    assert!(check_md5("1B39813549077B2347C0F370c3864b40").is_err());
+    assert!(check_md5("1b39813549077b2347c0f370c3864b4").is_err());
+    assert!(check_md5("1b39813549077b2347c0f370c3864b411").is_err());
+}
+
+pub fn check_sha1(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^[a-f0-9]{40}$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedChecksum(format!(
+            "not a valid SHA-1: '{}' (expected lower-case hex, eg, 'e9dd75237c94b209dc3ccd52722de6931a310ba3')",
+            raw
+        ))
+        .into())
+    }
+}
+
+#[test]
+fn test_check_sha1() {
+    assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba3").is_ok());
+    assert!(check_sha1("g9dd75237c94b209dc3ccd52722de6931a310ba3").is_err());
+    assert!(check_sha1("e9DD75237C94B209DC3CCD52722de6931a310ba3").is_err());
+    assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba").is_err());
+    assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba33").is_err());
+}
+
+pub fn check_sha256(raw: &str) -> Result<()> {
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"^[a-f0-9]{64}$").unwrap();
+    }
+    if RE.is_match(raw) {
+        Ok(())
+    } else {
+        Err(ErrorKind::MalformedChecksum(format!(
+            "not a valid SHA-256: '{}' (expected lower-case hex, eg, 'cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452')",
+            raw
+        ))
+        .into())
+    }
+}
+
+#[test]
+fn test_check_sha256() {
+    assert!(
+        check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_ok()
+    );
+    assert!(
+        check_sha256("gb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_err()
+    );
+    assert!(
+        check_sha256("UB1C378F464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_err()
+    );
+    assert!(
+        check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e45").is_err()
+    );
+    assert!(
+        check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e4522").is_err()
+    );
+}
+
+pub fn check_release_type(raw: &str) -> Result<()> {
+    let valid_types = vec![
+        // Citation Style Language official types
+        "article",
+        "article-magazine",
+        "article-newspaper",
+        "article-journal",
+        "bill",
+        "book",
+        "broadcast",
+        "chapter",
+        "dataset",
+        "entry",
+        "entry-dictionary",
+        "entry-encyclopedia",
+        "figure",
+        "graphic",
+        "interview",
+        "legislation",
+        "legal_case",
+        "manuscript",
+        "map",
+        "motion_picture",
+        "musical_score",
+        "pamphlet",
+        "paper-conference",
+        "patent",
+        "post",
+        "post-weblog",
+        "personal_communication",
+        "report",
+        "review",
+        "review-book",
+        "song",
+        "speech",
+        "thesis",
+        "treaty",
+        "webpage",
+        // fatcat-specific extensions
+        "peer_review",
+        "software",
+        "standard",
+    ];
+    for good in valid_types {
+        if raw == good {
+            return Ok(());
+        }
+    }
+    Err(ErrorKind::NotInControlledVocabulary(format!(
+        "not a valid release_type: '{}' (expected a CSL type, eg, 'article-journal', 'book')",
+        raw
+    ))
+    .into())
+}
+
+#[test]
+fn test_check_release_type() {
+    assert!(check_release_type("book").is_ok());
+    assert!(check_release_type("article-journal").is_ok());
+    assert!(check_release_type("standard").is_ok());
+    assert!(check_release_type("journal-article").is_err());
+    assert!(check_release_type("BOOK").is_err());
+    assert!(check_release_type("book ").is_err());
+}
+
+pub fn check_contrib_role(raw: &str) -> Result<()> {
+    let valid_types = vec![
+        // Citation Style Language official role types
+        "author",
+        "collection-editor",
+        "composer",
+        "container-author",
+        "director",
+        "editor",
+        "editorial-director",
+        "editortranslator",
+        "illustrator",
+        "interviewer",
+        "original-author",
+        "recipient",
+        "reviewed-author",
+        "translator",
+        // common extension (for conference proceeding chair)
+        //"chair",
+    ];
+    for good in valid_types {
+        if raw == good {
+            return Ok(());
+        }
+    }
+    Err(ErrorKind::NotInControlledVocabulary(format!(
+        "not a valid contrib.role: '{}' (expected a CSL type, eg, 'author', 'editor')",
+        raw
+    ))
+    .into())
+}
+
+#[test]
+fn test_check_contrib_role() {
+    assert!(check_contrib_role("author").is_ok());
+    assert!(check_contrib_role("editor").is_ok());
+    assert!(check_contrib_role("chair").is_err());
+    assert!(check_contrib_role("EDITOR").is_err());
+    assert!(check_contrib_role("editor ").is_err());
+}
+
+// TODO: make the above checks "more correct"
+// TODO: check ISBN-13
-- 
cgit v1.2.3