1 files changed, 28 insertions, 15 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 79cdd3b..d73dbdc 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -82,7 +82,7 @@ import json
 import operator
 import re
 import sys
-from typing import Dict, Tuple, Counter
+from typing import Counter, Dict, Tuple
 
 from glom import PathAccessError, glom
 
@@ -189,6 +189,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
         except PathAccessError:
             pass
 
+    # Datacite keeps track of versions.
     try:
         if a_title and a_title == b_title and glom(a, "extra.datacite.metadataVersion") != glom(
                 b, "extra.datacite.metadataVersion"):
@@ -196,6 +197,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
+    # UBC repository, we assume that different items in the same pool.
     try:
         prefix = "10.14288/"
         a_doi = glom(a, "ext_ids.doi")
@@ -209,15 +211,17 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
+    # The British Standards Institution (BSI) keeps various version of
+    # standards around, among them an "undated" variant.
+    # Reference to subdocument.
+    # https://api.fatcat.wiki/v0/release/tcro5wr6brhqnf5wettyiauw34
+    # https://api.fatcat.wiki/v0/release/s7a4o5v5gfg4tbzna6poyg7nzy
     try:
         a_doi = glom(a, "ext_ids.doi")
         b_doi = glom(b, "ext_ids.doi")
         if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
             if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
                 return (Status.STRONG, Reason.CUSTOM_BSI_UNDATED)
-            # Reference to subdocument.
-            # https://api.fatcat.wiki/v0/release/tcro5wr6brhqnf5wettyiauw34
-            # https://api.fatcat.wiki/v0/release/s7a4o5v5gfg4tbzna6poyg7nzy
             if a_title == b_title and ((dict_key_exists(a, "extra.subtitle")
                                         and not dict_key_exists(b, "extra.subtitle")) or
                                        (dict_key_exists(b, "extra.subtitle")
@@ -226,24 +230,28 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
+    # IOP science.
     try:
+        prefix = "10.1149"
         a_doi = glom(a, "ext_ids.doi")
         b_doi = glom(b, "ext_ids.doi")
-        if has_doi_prefix(a_doi, "10.1149") and has_doi_prefix(b_doi, "10.1149"):
-            if (a_doi.startswith("10.1149/ma") and not b_doi.startswith("10.1149/ma")
-                    or b_doi.startswith("10.1149/ma") and not a_doi.startswith("10.1149/ma")):
+        if has_doi_prefix(a_doi, prefix) and has_doi_prefix(b_doi, prefix):
+            v = "{}/ma".format(prefix)
+            if (a_doi.startswith(v) and not b_doi.startswith(v)
+                    or b_doi.startswith(v) and not a_doi.startswith(v)):
                 return (Status.DIFFERENT, Reason.CUSTOM_IOP_MA_PATTERN)
     except PathAccessError:
         pass
 
+    # Very manual, XXX: move this into blacklist.
     if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
         return (Status.DIFFERENT, Reason.CUSTOM_VHS)
 
     if re.match(r"appendix ?[^ ]*$", a_title_lower):
         return (Status.AMBIGUOUS, Reason.APPENDIX)
 
+    # Figshare, versions.
     try:
-        # TODO: figshare versions, "xxx.v1"
         FIGSHARE_PREFIX = "10.6084/"
         if glom(a, "ext_ids.doi").startswith(FIGSHARE_PREFIX) and glom(
                 b, "ext_ids.doi").startswith(FIGSHARE_PREFIX):
@@ -254,9 +262,10 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
+    # Generic, versioned DOI.
+    # https://fatcat.wiki/release/cwqujxztefdghhssb7ysxj7b5m
+    # https://fatcat.wiki/release/hwnqyz7n65eabhlivvkipkytji
     try:
-        # https://fatcat.wiki/release/cwqujxztefdghhssb7ysxj7b5m
-        # https://fatcat.wiki/release/hwnqyz7n65eabhlivvkipkytji
         a_doi = glom(a, "ext_ids.doi")
         b_doi = glom(b, "ext_ids.doi")
         versioned_doi_pattern = '10[.].*/v[0-9]{1,}$'
@@ -276,12 +285,12 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
-    # TODO: datacite specific vocabulary
+    # Datacite related identifiers.
     # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}
-    # beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
-    # TODO: does glom help?
-    # ...
-    if "datacite" in (a.get("extra", []) or []) and "datacite" in (b.get("extra", []) or []):
+    # beware: we have versions and "isPartOf", e.g.
+    # https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
+    # Datacite md schema: https://doi.org/10.14454/7xq3-zf69
+    if dict_key_exists(a, "extra.datacite") and dict_key_exists(b, "extra.datacite"):
         whitelist = set([
             "HasPart",
             "HasVersion",
@@ -309,6 +318,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
         except PathAccessError:
             pass
 
+    # Arxiv versions.
     try:
         id_a = re.match(r"(.*)v[0-9]{1,}$", glom(a, "ext_ids.arxiv")).group(1)
         id_b = re.match(r"(.*)v[0-9]{1,}$", glom(b, "ext_ids.arxiv")).group(1)
@@ -342,6 +352,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
+    # Datasets are typically different (and have less md and look similar).
     try:
         if (glom(a, "release_type") == "dataset" and glom(b, "release_type") == "dataset"
                 and glom(a, "ext_ids.doi") != glom(b, "ext_ids.doi")):
@@ -349,6 +360,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
+    # Common chapter names should be handled here.
     try:
         if (glom(a, "release_type") == "chapter" and glom(b, "release_type") == "chapter"
                 and glom(a, "extra.container_name") != glom(b, "extra.container_name")):
@@ -356,6 +368,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     except PathAccessError:
         pass
 
+    # Components tend to have similar names.
     try:
         if glom(a, "extra.crossref.type") == "component" and glom(a, "title") != glom(b, "title"):
             return (Status.DIFFERENT, Reason.COMPONENT)