3 files changed, 236 insertions, 48 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 1a0fb95..c937aa8 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -29,12 +29,21 @@ import collections
 import itertools
 import json
 import operator
+import sys
+from enum import Enum
+
+from fuzzycat.cluster import slugify_string
 
 get_key_values = operator.itemgetter("k", "v")
 
 # There titles appear too often, so ignore them for now.
 TITLE_BLACKLIST = set([
     "",
+    "about this issue",
+    "about this journal",
+    "abbreviations and acronyms",
+    "acknowledgment of reviewers",
+    "abbildung",
     "abstracts",
     "acknowledgements",
     "acknowledgments",
@@ -108,6 +117,39 @@ TITLE_BLACKLIST = set([
 ])
 
 
+class Status(str, Enum):
+    """
+    Match status for a pair of releases; first element of the tuple returned by compare().
+    """
+    EXACT = 'exact'  # identical title and identical, non-empty author sets
+    DIFFERENT = 'different'  # a rule decided that the releases do not match
+    STRONG = 'strong'  # preprint/published pair or versions of one arxiv id
+    WEAK = 'weak'  # not returned by compare() at the moment
+    AMBIGUOUS = 'ambiguous'  # no rule could decide (value was misspelled before)
+
+
+class OK(str, Enum):
+    """
+    Reason codes reported by compare() when no mismatch was found.
+    """
+    ARXIV_VERSION = "ok.arxiv_version"  # both carry an arxiv id with the same id part
+    DUMMY = "ok.dummy"  # fallthrough, none of the rules applied
+    TITLE_AUTHOR_MATCH = "ok.title_author_match"  # equal titles, equal author sets
+    PREPRINT_PUBLISHED = "ok.preprint_published"  # same slug title, shared author, one arxiv id
+
+
+class Miss(str, Enum):
+    """
+    Reason codes reported by compare() when a check speaks against a match.
+    """
+    ARXIV_VERSION = "miss.arxiv_version"  # both carry an arxiv id, id parts differ
+    BLACKLISTED = "miss.blacklisted"  # title of the first release is in TITLE_BLACKLIST
+    CONTRIB_INTERSECTION_EMPTY = "miss.contrib_intersection_empty"  # no shared raw_name
+    SHORT_TITLE = "miss.short_title"  # title of the first release has fewer than 5 chars
+    YEAR = "miss.year"  # same title and authors, but different release years
+    CUSTOM_VHS = "miss.vhs"  # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
+
+
 class GroupVerifier:
     """
     Verifier.
@@ -122,15 +164,12 @@ class GroupVerifier:
     def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10):
         self.iterable: collections.abc.Iterable = iterable
         self.max_cluster_size: int = 10
-        self.counter = collections.Counter({
-            "unique": 0,
-            "too_large": 0,
-        })
+        self.counter = collections.Counter()
 
     def run(self):
         for i, line in enumerate(self.iterable):
             if i % 20000 == 0:
-                print(i)
+                print(i, file=sys.stderr)
             line = line.strip()
             if not line:
                 continue
@@ -143,46 +182,60 @@ class GroupVerifier:
                 self.counter["too_large"] += 1
                 continue
             for a, b in itertools.combinations(vs, r=2):
-                result = self.compare(a, b)
-                # print(a.get("ident"), b.get("ident"), result)
-                # print(a.get("title")[:30], " ---- ", b.get("title")[:20])
-
-        print(json.dumps(dict(self.counter)))
-
-    def compare(self, a, b):
-        """
-        We compare two release entities here.
-
-        * ext_ids.doi
-        * contribs
-        * is the title meaningful enough, is it too common, too short
-        * files share a sha1
-        * arxiv versions
-        """
-        if len(a.get("title")) < 5:
-            self.counter["short_title"] += 1
-            return False
-        if a.get("title", "").lower() in TITLE_BLACKLIST:
-            self.counter["blacklist"] += 1
-            return False
-
-        arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
-        arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
-        if arxiv_id_a and arxiv_id_b:
-            id_a, version_a = arxiv_id_a.split("v")
-            id_b, version_b = arxiv_id_b.split("v")
-            if id_a == id_b:
-                self.counter["arxiv_v"] += 1
-                return True
-            else:
-                return False
-
-        a_authors = set([v.get("raw_name") for v in a.get("contribs", [])])
-        b_authors = set([v.get("raw_name") for v in b.get("contribs", [])])
-
-        if len(a_authors & b_authors) == 0:
-            self.counter["contrib_miss"] += 1
-            return False
-
-        self.counter["dummy"] += 1
-        return True
+                result, reason = compare(a, b)
+                self.counter[reason] += 1
+                print("https://fatcat.wiki/release/{}".format(a["ident"]),
+                      "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason)
+
+        self.counter["total"] = sum(v for _, v in self.counter.items())
+        print(json.dumps(dict(self.counter)), file=sys.stderr)
+
+
+def compare(a, b):
+    """
+    Compare two release entities (dicts), return a (Status, reason) tuple.
+
+    Checks run in order and the first hit wins: short or blacklisted title
+    of `a`, VHS special case, identical title and authors (years must not
+    conflict), preprint vs. published, arxiv versions, disjoint authors.
+    A missing or null title, contribs or ext_ids value is treated as empty.
+    """
+    a_title, b_title = a.get("title") or "", b.get("title") or ""
+    if len(a_title) < 5:
+        return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
+    if a_title.lower() in TITLE_BLACKLIST:
+        return (Status.AMBIGUOUS, Miss.BLACKLISTED)
+    if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
+        return (Status.DIFFERENT, Miss.CUSTOM_VHS)
+
+    arxiv_id_a = (a.get("ext_ids") or {}).get("arxiv")
+    arxiv_id_b = (b.get("ext_ids") or {}).get("arxiv")
+    a_authors = {v.get("raw_name") for v in a.get("contribs") or []}
+    b_authors = {v.get("raw_name") for v in b.get("contribs") or []}
+    a_release_year = a.get("release_year")
+    b_release_year = b.get("release_year")
+
+    if a_title == b_title and a_authors and a_authors == b_authors:
+        if a_release_year and b_release_year and a_release_year != b_release_year:
+            return (Status.DIFFERENT, Miss.YEAR)
+        return (Status.EXACT, OK.TITLE_AUTHOR_MATCH)
+
+    a_slug_title, b_slug_title = slugify_string(a_title), slugify_string(b_title)
+    if a_slug_title and a_slug_title == b_slug_title and a_authors & b_authors:
+        # Exactly one side has an arxiv id: treated as preprint and published version.
+        if (arxiv_id_a is None) != (arxiv_id_b is None):
+            return (Status.STRONG, OK.PREPRINT_PUBLISHED)
+
+    if arxiv_id_a and arxiv_id_b:
+        # Only a trailing "v<digits>" is a version suffix; a bare split("v") raises on
+        # unversioned ids and on archive names containing "v" (e.g. "solv-int/...").
+        parts = (arxiv_id.rpartition("v") for arxiv_id in (arxiv_id_a, arxiv_id_b))
+        id_a, id_b = (h if s and t.isdigit() else h + s + t for h, s, t in parts)
+        if id_a == id_b:
+            return (Status.STRONG, OK.ARXIV_VERSION)
+        return (Status.DIFFERENT, Miss.ARXIV_VERSION)
+
+    if a_authors and not a_authors & b_authors:
+        return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)
+
+    return (Status.AMBIGUOUS, OK.DUMMY)
diff --git a/tests/test_verify.py b/tests/test_verify.py
new file mode 100644
index 0000000..be4b0ec
--- /dev/null
+++ b/tests/test_verify.py
@@ -0,0 +1,31 @@
+import operator
+import os
+import yaml
+try:
+    from yaml import CLoader as Loader
+except ImportError:
+    from yaml import Loader
+
+from fuzzycat.verify import compare, Status
+
+
+def test_verify_cases():
+    """
+    Run compare() on every YAML case in the "test_verify" folder next to this file.
+
+    Each document holds two releases ("a", "b"), the expected status name
+    ("status") and a description ("about") that ends up in the failure message.
+    """
+    fields = operator.itemgetter("a", "b", "status", "about")
+    folder = os.path.join(os.path.dirname(__file__), "test_verify")
+    for root, _, files in os.walk(folder):
+        for fn in sorted(files):
+            path = os.path.join(root, fn)
+            # Read as UTF-8 rather than with the platform default encoding.
+            with open(path, encoding="utf-8") as f:
+                doc = yaml.load(f, Loader=Loader)
+            a, b, status, about = fields(doc)
+            # Lookup by name raises KeyError on a misspelled status instead of comparing to None.
+            expected = Status[status]
+            result, _ = compare(a, b)
+            assert expected == result, "{}: {}".format(path, about)
diff --git a/tests/test_verify/0000.yml b/tests/test_verify/0000.yml
new file mode 100644
index 0000000..a82b2fe
--- /dev/null
+++ b/tests/test_verify/0000.yml
@@ -0,0 +1,104 @@
+about: Same document should be an exact match.
+status: EXACT
+a:
+  abstracts:
+  - content: Belgium Herbarium image of Meise Botanic Garden.
+    lang: de
+    mimetype: text/plain
+    sha1: cd3c76f5fd94bcf260f9ad74f797d9e79a824b1d
+  contribs:
+  - index: 0
+    raw_name: Meise Botanic Garden
+    role: author
+  ext_ids:
+    doi: 10.5281/zenodo.2830437
+  extra:
+    datacite:
+      license:
+      - rights: Creative Commons Attribution Share Alike 4.0 International
+        rightsUri: http://creativecommons.org/licenses/by-sa/4.0/legalcode
+      - rights: Open Access
+        rightsUri: info:eu-repo/semantics/openAccess
+      relations:
+      - relatedIdentifier: 10.5281/zenodo.2830436
+        relatedIdentifierType: DOI
+        relationType: IsVersionOf
+      - relatedIdentifier: https://zenodo.org/communities/belgiumherbarium
+        relatedIdentifierType: URL
+        relationType: IsPartOf
+      resourceType: Photo
+      resourceTypeGeneral: Image
+      subjects:
+      - subject: Biodiversity
+      - subject: Taxonomy
+      - subject: Terrestrial
+      - subject: Herbarium
+      - subject: Caryophyllaceae
+    release_month: 5
+  files: []
+  filesets: []
+  ident: jihezebuzbgxpmsj3356idy52e
+  license_slug: CC-BY-SA
+  publisher: Zenodo
+  refs: []
+  release_date: "2019-05-14"
+  release_stage: published
+  release_type: graphic
+  release_year: 2019
+  revision: 560ca270-45c5-4f21-89a6-0dfd73039546
+  state: active
+  title: Dianthus carthusianorum L. (BR0000005352692)
+  webcaptures: []
+  work_id: aaaaa34uyngfplcgmoejzjyjne
+b:
+  abstracts:
+  - content: Belgium Herbarium image of Meise Botanic Garden.
+    lang: de
+    mimetype: text/plain
+    sha1: cd3c76f5fd94bcf260f9ad74f797d9e79a824b1d
+  contribs:
+  - index: 0
+    raw_name: Meise Botanic Garden
+    role: author
+  ext_ids:
+    doi: 10.5281/zenodo.2830437
+  extra:
+    datacite:
+      license:
+      - rights: Creative Commons Attribution Share Alike 4.0 International
+        rightsUri: http://creativecommons.org/licenses/by-sa/4.0/legalcode
+      - rights: Open Access
+        rightsUri: info:eu-repo/semantics/openAccess
+      relations:
+      - relatedIdentifier: 10.5281/zenodo.2830436
+        relatedIdentifierType: DOI
+        relationType: IsVersionOf
+      - relatedIdentifier: https://zenodo.org/communities/belgiumherbarium
+        relatedIdentifierType: URL
+        relationType: IsPartOf
+      resourceType: Photo
+      resourceTypeGeneral: Image
+      subjects:
+      - subject: Biodiversity
+      - subject: Taxonomy
+      - subject: Terrestrial
+      - subject: Herbarium
+      - subject: Caryophyllaceae
+    release_month: 5
+  files: []
+  filesets: []
+  ident: jihezebuzbgxpmsj3356idy52e
+  license_slug: CC-BY-SA
+  publisher: Zenodo
+  refs: []
+  release_date: "2019-05-14"
+  release_stage: published
+  release_type: graphic
+  release_year: 2019
+  revision: 560ca270-45c5-4f21-89a6-0dfd73039546
+  state: active
+  title: Dianthus carthusianorum L. (BR0000005352692)
+  webcaptures: []
+  work_id: aaaaa34uyngfplcgmoejzjyjne
+
+