From ba7d6a842cb4d61357b588fb2d3ec552c654ae64 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 8 Jan 2019 23:18:32 -0800 Subject: huge refactor of rust modules/files Taking advantage of new Rust 2018 crate/module path changes, and re-organizing things. Somewhat optimistic this could help with partial rebuild speed also. --- rust/src/identifiers.rs | 376 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 376 insertions(+) create mode 100644 rust/src/identifiers.rs (limited to 'rust/src/identifiers.rs') diff --git a/rust/src/identifiers.rs b/rust/src/identifiers.rs new file mode 100644 index 00000000..adb9f413 --- /dev/null +++ b/rust/src/identifiers.rs @@ -0,0 +1,376 @@ +use crate::errors::*; +use data_encoding::BASE32_NOPAD; +use regex::Regex; +use serde_json; +use std::str::FromStr; +use uuid::Uuid; + +#[derive(Clone, Copy, PartialEq, Debug)] +pub struct FatCatId(Uuid); + +impl ToString for FatCatId { + fn to_string(&self) -> String { + uuid2fcid(&self.to_uuid()) + } +} + +impl FromStr for FatCatId { + type Err = Error; + fn from_str(s: &str) -> Result { + fcid2uuid(s).map(|u| FatCatId(u)) + } +} + +impl FatCatId { + pub fn to_uuid(&self) -> Uuid { + self.0 + } + // TODO: just make it u: Uuid and clone (not by ref) + pub fn from_uuid(u: &Uuid) -> FatCatId { + FatCatId(*u) + } +} + +/// Convert fatcat IDs (base32 strings) to UUID +pub fn fcid2uuid(fcid: &str) -> Result { + if fcid.len() != 26 { + return Err(ErrorKind::InvalidFatcatId(fcid.to_string()).into()); + } + let mut raw = vec![0; 16]; + BASE32_NOPAD + .decode_mut(fcid.to_uppercase().as_bytes(), &mut raw) + .map_err(|_dp| ErrorKind::InvalidFatcatId(fcid.to_string()))?; + // unwrap() is safe here, because we know raw is always 16 bytes + Ok(Uuid::from_bytes(&raw).unwrap()) +} + +/// Convert UUID to fatcat ID string (base32 encoded) +pub fn uuid2fcid(id: &Uuid) -> String { + let raw = id.as_bytes(); + BASE32_NOPAD.encode(raw).to_lowercase() +} + +pub fn check_username(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^[A-Za-z][A-Za-z0-9._-]{2,24}$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedExternalId(format!( + "not a valid username: '{}' (expected, eg, 'AcidBurn')", + raw + )) + .into()) + } +} + +#[test] +fn test_check_username() { + assert!(check_username("bnewbold").is_ok()); + assert!(check_username("BNEWBOLD").is_ok()); + assert!(check_username("admin").is_ok()); + assert!(check_username("friend-bot").is_ok()); + assert!(check_username("dog").is_ok()); + assert!(check_username("g_____").is_ok()); + assert!(check_username("bnewbold2-archive").is_ok()); + assert!(check_username("bnewbold2-internetarchive").is_ok()); + + assert!(check_username("").is_err()); + assert!(check_username("_").is_err()); + assert!(check_username("gg").is_err()); + assert!(check_username("adminadminadminadminadminadminadmin").is_err()); + assert!(check_username("bryan newbold").is_err()); + assert!(check_username("01234567-3456-6780").is_err()); + assert!(check_username(".admin").is_err()); + assert!(check_username("-bot").is_err()); +} + +pub fn check_pmcid(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^PMC\d+$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedExternalId(format!( + "not a valid PubMed Central ID (PMCID): '{}' (expected, eg, 'PMC12345')", + raw + )) + .into()) + } +} + +pub fn check_pmid(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^\d+$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedExternalId(format!( + "not a valid PubMed ID (PMID): '{}' (expected, eg, '1234')", + raw + )) + .into()) + } +} + +pub fn check_wikidata_qid(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^Q\d+$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedExternalId(format!( + "not a valid Wikidata QID: '{}' (expected, eg, 'Q1234')", + raw + )) + .into()) + } +} + +pub fn check_doi(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^10.\d{3,6}/.+$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedExternalId(format!( + "not a valid DOI: '{}' (expected, eg, '10.1234/aksjdfh')", + raw + )) + .into()) + } +} + +pub fn check_issn(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^\d{4}-\d{3}[0-9X]$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedExternalId(format!( + "not a valid ISSN: '{}' (expected, eg, '1234-5678')", + raw + )) + .into()) + } +} + +pub fn check_orcid(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedExternalId(format!( + "not a valid ORCID: '{}' (expected, eg, '0123-4567-3456-6789')", + raw + )) + .into()) + } +} + +#[test] +fn test_check_orcid() { + assert!(check_orcid("0123-4567-3456-6789").is_ok()); + assert!(check_orcid("0123-4567-3456-678X").is_ok()); + assert!(check_orcid("01234567-3456-6780").is_err()); + assert!(check_orcid("0x23-4567-3456-6780").is_err()); +} + +pub fn check_md5(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^[a-f0-9]{32}$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedChecksum(format!( + "not a valid MD5: '{}' (expected lower-case hex, eg, '1b39813549077b2347c0f370c3864b40')", + raw + )) + .into()) + } +} + +#[test] +fn test_check_md5() { + assert!(check_md5("1b39813549077b2347c0f370c3864b40").is_ok()); + assert!(check_md5("1g39813549077b2347c0f370c3864b40").is_err()); + assert!(check_md5("1B39813549077B2347C0F370c3864b40").is_err()); + assert!(check_md5("1b39813549077b2347c0f370c3864b4").is_err()); + assert!(check_md5("1b39813549077b2347c0f370c3864b411").is_err()); +} + +pub fn check_sha1(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^[a-f0-9]{40}$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedChecksum(format!( + "not a valid SHA-1: '{}' (expected lower-case hex, eg, 'e9dd75237c94b209dc3ccd52722de6931a310ba3')", + raw + )) + .into()) + } +} + +#[test] +fn test_check_sha1() { + assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba3").is_ok()); + assert!(check_sha1("g9dd75237c94b209dc3ccd52722de6931a310ba3").is_err()); + assert!(check_sha1("e9DD75237C94B209DC3CCD52722de6931a310ba3").is_err()); + assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba").is_err()); + assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba33").is_err()); +} + +pub fn check_sha256(raw: &str) -> Result<()> { + lazy_static! { + static ref RE: Regex = Regex::new(r"^[a-f0-9]{64}$").unwrap(); + } + if RE.is_match(raw) { + Ok(()) + } else { + Err(ErrorKind::MalformedChecksum(format!( + "not a valid SHA-256: '{}' (expected lower-case hex, eg, 'cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452')", + raw + )) + .into()) + } +} + +#[test] +fn test_check_sha256() { + assert!( + check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_ok() + ); + assert!( + check_sha256("gb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_err() + ); + assert!( + check_sha256("UB1C378F464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_err() + ); + assert!( + check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e45").is_err() + ); + assert!( + check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e4522").is_err() + ); +} + +pub fn check_release_type(raw: &str) -> Result<()> { + let valid_types = vec![ + // Citation Style Language official types + "article", + "article-magazine", + "article-newspaper", + "article-journal", + "bill", + "book", + "broadcast", + "chapter", + "dataset", + "entry", + "entry-dictionary", + "entry-encyclopedia", + "figure", + "graphic", + "interview", + "legislation", + "legal_case", + "manuscript", + "map", + "motion_picture", + "musical_score", + "pamphlet", + "paper-conference", + "patent", + "post", + "post-weblog", + "personal_communication", + "report", + "review", + "review-book", + "song", + "speech", + "thesis", + "treaty", + "webpage", + // fatcat-specific extensions + "peer_review", + "software", + "standard", + ]; + for good in valid_types { + if raw == good { + return Ok(()); + } + } + Err(ErrorKind::NotInControlledVocabulary(format!( + "not a valid release_type: '{}' (expected a CSL type, eg, 'article-journal', 'book')", + raw + )) + .into()) +} + +#[test] +fn test_check_release_type() { + assert!(check_release_type("book").is_ok()); + assert!(check_release_type("article-journal").is_ok()); + assert!(check_release_type("standard").is_ok()); + assert!(check_release_type("journal-article").is_err()); + assert!(check_release_type("BOOK").is_err()); + assert!(check_release_type("book ").is_err()); +} + +pub fn check_contrib_role(raw: &str) -> Result<()> { + let valid_types = vec![ + // Citation Style Language official role types + "author", + "collection-editor", + "composer", + "container-author", + "director", + "editor", + "editorial-director", + "editortranslator", + "illustrator", + "interviewer", + "original-author", + "recipient", + "reviewed-author", + "translator", + // common extension (for conference proceeding chair) + //"chair", + ]; + for good in valid_types { + if raw == good { + return Ok(()); + } + } + Err(ErrorKind::NotInControlledVocabulary(format!( + "not a valid contrib.role: '{}' (expected a CSL type, eg, 'author', 'editor')", + raw + )) + .into()) +} + +#[test] +fn test_check_contrib_role() { + assert!(check_contrib_role("author").is_ok()); + assert!(check_contrib_role("editor").is_ok()); + assert!(check_contrib_role("chair").is_err()); + assert!(check_contrib_role("EDITOR").is_err()); + assert!(check_contrib_role("editor ").is_err()); +} + +// TODO: make the above checks "more correct" +// TODO: check ISBN-13 -- cgit v1.2.3