about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-13 19:17:19 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-13 19:17:19 -0700
commit: e63e21495c5e63f9fce3a0204f178d104c46124e (patch)
tree: e80e76c4a0b64dd97959e63a375d012d58e4d4df
parent: 5c3156b10f05bd68530dd1fb2c502764b8397cb1 (diff)
download: fatcat-e63e21495c5e63f9fce3a0204f178d104c46124e.tar.gz
download: fatcat-e63e21495c5e63f9fce3a0204f178d104c46124e.zip
rust: fill in missing extid checkers
-rw-r--r-- rust/src/endpoint_handlers.rs 14
-rw-r--r-- rust/src/entity_crud.rs 17
-rw-r--r-- rust/src/identifiers.rs 239
3 files changed, 251 insertions, 19 deletions
diff --git a/rust/src/endpoint_handlers.rs b/rust/src/endpoint_handlers.rs
index ab3b81ce..a19d33f9 100644
--- a/rust/src/endpoint_handlers.rs
+++ b/rust/src/endpoint_handlers.rs
@@ -301,7 +301,7 @@ impl Server {
.first(conn)?
}
(None, None, Some(isbn13), None, None, None, None, None, None, None) => {
- // TODO: check_isbn13(isbn13)?;
+ check_isbn13(isbn13)?;
let (rev, ident, _extid): (ReleaseRevRow, ReleaseIdentRow, ReleaseExtidRow) =
release_rev::table
.inner_join(release_ident::table)
@@ -332,7 +332,7 @@ impl Server {
.first(conn)?
}
(None, None, None, None, None, Some(core), None, None, None, None) => {
- // TODO: check_core_id(core)?;
+ check_core_id(core)?;
release_ident::table
.inner_join(release_rev::table)
.filter(release_rev::core_id.eq(core))
@@ -341,7 +341,9 @@ impl Server {
.first(conn)?
}
(None, None, None, None, None, None, Some(arxiv), None, None, None) => {
- // TODO: check_arxiv_id(arxiv_id)?;
+ // TODO: this allows only lookup by full, versioned arxiv identifier. Probably also
+ // want to allow lookup by "work" style identifier?
+ check_arxiv_id(arxiv)?;
let (rev, ident, _extid): (ReleaseRevRow, ReleaseIdentRow, ReleaseExtidRow) =
release_rev::table
.inner_join(release_ident::table)
@@ -354,7 +356,7 @@ impl Server {
(ident, rev)
}
(None, None, None, None, None, None, None, Some(jstor), None, None) => {
- // TODO: check_jstor_id(jstor_id)?;
+ check_jstor_id(jstor)?;
let (rev, ident, _extid): (ReleaseRevRow, ReleaseIdentRow, ReleaseExtidRow) =
release_rev::table
.inner_join(release_ident::table)
@@ -367,7 +369,7 @@ impl Server {
(ident, rev)
}
(None, None, None, None, None, None, None, None, Some(ark), None) => {
- // TODO: check_ark_id(ark_id)?;
+ check_ark_id(ark)?;
let (rev, ident, _extid): (ReleaseRevRow, ReleaseIdentRow, ReleaseExtidRow) =
release_rev::table
.inner_join(release_ident::table)
@@ -380,7 +382,7 @@ impl Server {
(ident, rev)
}
(None, None, None, None, None, None, None, None, None, Some(mag)) => {
- // TODO: check_mag_id(mag_id)?;
+ check_mag_id(mag)?;
let (rev, ident, _extid): (ReleaseRevRow, ReleaseIdentRow, ReleaseExtidRow) =
release_rev::table
.inner_join(release_ident::table)
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index af496ad9..3bd19cce 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -1988,6 +1988,7 @@ impl EntityCrud for ReleaseEntity {
fn db_insert_revs(conn: &DbConn, models: &[&Self]) -> Result<Vec<Uuid>> {
// first verify external identifier syntax
for entity in models {
+ // TODO: yeah... helper function to call all these?
if let Some(ref extid) = entity.ext_ids.doi {
check_doi(extid)?;
}
@@ -2000,7 +2001,21 @@ impl EntityCrud for ReleaseEntity {
if let Some(ref extid) = entity.ext_ids.wikidata_qid {
check_wikidata_qid(extid)?;
}
- // TODO: JSTOR and arxiv IDs
+ if let Some(ref extid) = entity.ext_ids.isbn13 {
+ check_isbn13(extid)?;
+ }
+ if let Some(ref extid) = entity.ext_ids.core {
+ check_core_id(extid)?;
+ }
+ if let Some(ref extid) = entity.ext_ids.jstor {
+ check_jstor_id(extid)?;
+ }
+ if let Some(ref extid) = entity.ext_ids.mag {
+ check_mag_id(extid)?;
+ }
+ if let Some(ref extid) = entity.ext_ids.ark {
+ check_ark_id(extid)?;
+ }
if let Some(ref release_type) = entity.release_type {
check_release_type(release_type)?;
}
diff --git a/rust/src/identifiers.rs b/rust/src/identifiers.rs
index 18423643..ae75c8a7 100644
--- a/rust/src/identifiers.rs
+++ b/rust/src/identifiers.rs
@@ -51,7 +51,7 @@ impl FatcatId {
/// Convert fatcat IDs (base32 strings) to UUID
pub fn fcid2uuid(fcid: &str) -> Result<Uuid> {
- if fcid.len() != 26 {
+ if fcid.is_ascii() == false || fcid.len() != 26 {
return Err(FatcatError::InvalidFatcatId(fcid.to_string()).into());
}
let mut raw = vec![0; 16];
@@ -72,7 +72,7 @@ pub fn check_username(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^[A-Za-z][A-Za-z0-9._-]{2,24}$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedExternalId(
@@ -96,6 +96,7 @@ fn test_check_username() {
assert!(check_username("").is_err());
assert!(check_username("_").is_err());
assert!(check_username("gg").is_err());
+ assert!(check_username("bnewbßasdf").is_err());
assert!(check_username("adminadminadminadminadminadminadmin").is_err());
assert!(check_username("bryan newbold").is_err());
assert!(check_username("01234567-3456-6780").is_err());
@@ -107,7 +108,7 @@ pub fn check_pmcid(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^PMC\d+$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedExternalId(
@@ -117,11 +118,19 @@ pub fn check_pmcid(raw: &str) -> Result<()> {
}
}
+#[test]
+fn test_check_pmcid() {
+ assert!(check_pmcid("PMC12345").is_ok());
+ assert!(check_pmcid("PMC12345 ").is_err());
+ assert!(check_pmcid("PMC").is_err());
+ assert!(check_pmcid("PMC1.2345").is_err());
+}
+
pub fn check_pmid(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^\d+$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedExternalId(
@@ -131,11 +140,93 @@ pub fn check_pmid(raw: &str) -> Result<()> {
}
}
+#[test]
+fn test_check_pmid() {
+ assert!(check_pmid("1234").is_ok());
+ assert!(check_pmid("1234 ").is_err());
+ assert!(check_pmid("").is_err());
+ assert!(check_pmid("1.234").is_err());
+ assert!(check_pmid("-1234").is_err());
+ assert!(check_pmid(" 1234").is_err());
+}
+
+pub fn check_mag_id(raw: &str) -> Result<()> {
+ lazy_static! {
+ static ref RE: Regex = Regex::new(r"^\d+$").unwrap();
+ }
+ if raw.is_ascii() && RE.is_match(raw) {
+ Ok(())
+ } else {
+ Err(FatcatError::MalformedExternalId(
+ "Microsoft Academic Graph (mag) (expected, eg, '1234')".to_string(),
+ raw.to_string(),
+ ))?
+ }
+}
+
+#[test]
+fn test_check_mag_id() {
+ assert!(check_mag_id("1234").is_ok());
+ assert!(check_mag_id("1234 ").is_err());
+ assert!(check_mag_id("").is_err());
+ assert!(check_mag_id("1.234").is_err());
+ assert!(check_mag_id("-1234").is_err());
+ assert!(check_mag_id(" 1234").is_err());
+}
+
+pub fn check_jstor_id(raw: &str) -> Result<()> {
+ lazy_static! {
+ static ref RE: Regex = Regex::new(r"^\d+$").unwrap();
+ }
+ if raw.is_ascii() && RE.is_match(raw) {
+ Ok(())
+ } else {
+ Err(FatcatError::MalformedExternalId(
+ "JSTOR (jstor_id) (expected, eg, '1234')".to_string(),
+ raw.to_string(),
+ ))?
+ }
+}
+
+#[test]
+fn test_check_jstor_id() {
+ assert!(check_jstor_id("1234").is_ok());
+ assert!(check_jstor_id("1234 ").is_err());
+ assert!(check_jstor_id("").is_err());
+ assert!(check_jstor_id("1.234").is_err());
+ assert!(check_jstor_id("-1234").is_err());
+ assert!(check_jstor_id(" 1234").is_err());
+}
+
+pub fn check_core_id(raw: &str) -> Result<()> {
+ lazy_static! {
+ static ref RE: Regex = Regex::new(r"^\d+$").unwrap();
+ }
+ if raw.is_ascii() && RE.is_match(raw) {
+ Ok(())
+ } else {
+ Err(FatcatError::MalformedExternalId(
+ "CORE.ac.uk (core_id) (expected, eg, '1234')".to_string(),
+ raw.to_string(),
+ ))?
+ }
+}
+
+#[test]
+fn test_check_core_id() {
+ assert!(check_core_id("1234").is_ok());
+ assert!(check_core_id("1234 ").is_err());
+ assert!(check_core_id("").is_err());
+ assert!(check_core_id("1.234").is_err());
+ assert!(check_core_id("-1234").is_err());
+ assert!(check_core_id(" 1234").is_err());
+}
+
pub fn check_wikidata_qid(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^Q\d+$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedExternalId(
@@ -144,12 +235,20 @@ pub fn check_wikidata_qid(raw: &str) -> Result<()> {
))?
}
}
+#[test]
+fn test_check_wikidata_qid() {
+ assert!(check_wikidata_qid("Q1234").is_ok());
+ assert!(check_wikidata_qid("Q1234 ").is_err());
+ assert!(check_wikidata_qid("Q").is_err());
+ assert!(check_wikidata_qid("Q1-234").is_err());
+ assert!(check_wikidata_qid("1234").is_err());
+}
pub fn check_doi(raw: &str) -> Result<()> {
lazy_static! {
- static ref RE: Regex = Regex::new(r"^10.\d{3,6}/.+$").unwrap();
+ static ref RE: Regex = Regex::new(r"^10.\d{3,6}/\S+$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedExternalId(
@@ -159,11 +258,111 @@ pub fn check_doi(raw: &str) -> Result<()> {
}
}
+#[test]
+fn test_check_doi() {
+ assert!(check_doi("10.1234/aksjdfh").is_ok());
+ assert!(check_doi("10.1234/ak../2949_-d.(asdf)fh").is_ok());
+ assert!(check_doi("10.1234/ßs").is_err());
+ assert!(check_doi("10.1234/aksjdfh ").is_err());
+ assert!(check_doi("10.1234/ak sjdfh").is_err());
+ assert!(check_doi("10.1234/aks\tjdfh").is_err());
+ assert!(check_doi("10.1234/ ").is_err());
+ assert!(check_doi("10.2/aksjdfh").is_err());
+ assert!(check_doi("10.1234/\naksjdfh").is_err());
+ assert!(check_doi("10.1234").is_err());
+ assert!(check_doi("10.1234/").is_err());
+}
+
+pub fn check_arxiv_id(raw: &str) -> Result<()> {
+ lazy_static! {
+ static ref RE: Regex = Regex::new(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})v\d+$").unwrap();
+ }
+ if raw.is_ascii() && RE.is_match(raw) {
+ Ok(())
+ } else {
+ Err(FatcatError::MalformedExternalId(
+ "versioned arXiv identifier (expected, eg, '0806.2878v1')".to_string(),
+ raw.to_string(),
+ ))?
+ }
+}
+
+#[test]
+fn test_check_arxiv_id() {
+ assert!(check_arxiv_id("0806.2878v1").is_ok());
+ assert!(check_arxiv_id("1501.00001v1").is_ok());
+ assert!(check_arxiv_id("hep-th/9901001v1").is_ok());
+ assert!(check_arxiv_id("math.CA/0611800v2").is_ok());
+
+ assert!(check_arxiv_id("hep-TH/9901001v1").is_err());
+ assert!(check_arxiv_id("hßp-th/9901001v1").is_err());
+ assert!(check_arxiv_id("math.CA/06l1800v2").is_err());
+ assert!(check_arxiv_id("mßth.ca/0611800v2").is_err());
+ assert!(check_arxiv_id("MATH.CA/0611800v2").is_err());
+ assert!(check_arxiv_id("0806.2878v23").is_ok());
+ assert!(check_arxiv_id("0806.2878v").is_err());
+ assert!(check_arxiv_id("0806.2878").is_err());
+ assert!(check_arxiv_id("0806.2878v1 ").is_err());
+ assert!(check_arxiv_id("006.2878v1").is_err());
+ assert!(check_arxiv_id("0806.v1").is_err());
+ assert!(check_arxiv_id("08062878v1").is_err());
+}
+
+pub fn check_ark_id(raw: &str) -> Result<()> {
+ lazy_static! {
+ static ref RE: Regex = Regex::new(r"^ark:/\d{5,9}/\S+$").unwrap();
+ }
+ if raw.is_ascii() && RE.is_match(raw) {
+ Ok(())
+ } else {
+ Err(FatcatError::MalformedExternalId(
+ "ARK identifier (expected, eg, 'ark:/13030/m53r5pzm')".to_string(),
+ raw.to_string(),
+ ))?
+ }
+}
+
+#[test]
+fn test_check_ark_id() {
+ assert!(check_ark_id("ark:/13030/m53r5pzm").is_ok());
+ assert!(check_ark_id("ark:/13030/m53r5pzm ").is_err());
+ assert!(check_ark_id("ark:/13030/m53r5ßzm").is_err());
+ assert!(check_ark_id("ARK:/13030/m53r5pzm").is_err());
+ assert!(check_ark_id("ark:/13030/m53r5pzm.bla-deedah").is_ok());
+ assert!(check_ark_id("/13030/m53r5pzm").is_err());
+ assert!(check_ark_id("ark:/blah/m53r5pzm").is_err());
+ assert!(check_ark_id("ark:/13030/").is_err());
+ assert!(check_ark_id("ark:/13030").is_err());
+}
+
+pub fn check_isbn13(raw: &str) -> Result<()> {
+ lazy_static! {
+ // via https://stackoverflow.com/a/4381556
+ static ref RE: Regex = Regex::new(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$").unwrap();
+ }
+ if raw.len() == 17 && raw.is_ascii() && RE.is_match(raw) {
+ Ok(())
+ } else {
+ Err(FatcatError::MalformedExternalId(
+ "Canonical ISBN-13 (expected, eg, '978-1-56619-909-4')".to_string(),
+ raw.to_string(),
+ ))?
+ }
+}
+
+#[test]
+fn test_check_isbn13() {
+ assert!(check_isbn13("978-1-56619-909-4").is_ok());
+ assert!(check_isbn13("978-1-4028-9462-6").is_ok());
+ assert!(check_isbn13("978-1-56619-909-4 ").is_err());
+ assert!(check_isbn13("9781566199094").is_err());
+}
+
pub fn check_issn(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^\d{4}-\d{3}[0-9X]$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedExternalId(
@@ -173,11 +372,21 @@ pub fn check_issn(raw: &str) -> Result<()> {
}
}
+#[test]
+fn test_check_issn() {
+ assert!(check_issn("1234-5678").is_ok());
+ assert!(check_issn("1234-567X").is_ok());
+ assert!(check_issn("1234-5678 ").is_err());
+ assert!(check_issn(" 1234-5678").is_err());
+ assert!(check_issn("12345678").is_err());
+ assert!(check_issn("0123-56789").is_err());
+}
+
pub fn check_orcid(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedExternalId(
@@ -191,6 +400,7 @@ pub fn check_orcid(raw: &str) -> Result<()> {
fn test_check_orcid() {
assert!(check_orcid("0123-4567-3456-6789").is_ok());
assert!(check_orcid("0123-4567-3456-678X").is_ok());
+ assert!(check_orcid("0123-4567-3456-6789 ").is_err());
assert!(check_orcid("01234567-3456-6780").is_err());
assert!(check_orcid("0x23-4567-3456-6780").is_err());
}
@@ -199,7 +409,7 @@ pub fn check_md5(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^[a-f0-9]{32}$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedChecksum(
@@ -212,6 +422,7 @@ pub fn check_md5(raw: &str) -> Result<()> {
#[test]
fn test_check_md5() {
assert!(check_md5("1b39813549077b2347c0f370c3864b40").is_ok());
+ assert!(check_md5("1b39813549077b2347c0f370c3864b40 ").is_err());
assert!(check_md5("1g39813549077b2347c0f370c3864b40").is_err());
assert!(check_md5("1B39813549077B2347C0F370c3864b40").is_err());
assert!(check_md5("1b39813549077b2347c0f370c3864b4").is_err());
@@ -222,7 +433,7 @@ pub fn check_sha1(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^[a-f0-9]{40}$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedChecksum(
@@ -235,6 +446,7 @@ pub fn check_sha1(raw: &str) -> Result<()> {
#[test]
fn test_check_sha1() {
assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba3").is_ok());
+ assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba3 ").is_err());
assert!(check_sha1("g9dd75237c94b209dc3ccd52722de6931a310ba3").is_err());
assert!(check_sha1("e9DD75237C94B209DC3CCD52722de6931a310ba3").is_err());
assert!(check_sha1("e9dd75237c94b209dc3ccd52722de6931a310ba").is_err());
@@ -245,7 +457,7 @@ pub fn check_sha256(raw: &str) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"^[a-f0-9]{64}$").unwrap();
}
- if RE.is_match(raw) {
+ if raw.is_ascii() && RE.is_match(raw) {
Ok(())
} else {
Err(FatcatError::MalformedChecksum(
@@ -261,6 +473,9 @@ fn test_check_sha256() {
check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_ok()
);
assert!(
+ check_sha256("cb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452 ").is_err()
+ );
+ assert!(
check_sha256("gb1c378f464d5935ddaa8de28446d82638396c61f042295d7fb85e3cccc9e452").is_err()
);
assert!(