wip: verification

Output currently (1m sample): { "unique": 916075, "too_large": 575, "dummy": 10307, "contrib_miss": 27215, "short_title": 1379, "arxiv_v": 8943 }
author: Martin Czygan <martin.czygan@gmail.com> 2020-11-13 02:14:26 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-13 02:14:26 +0100
commit: 1f91606501754bf8d3fa8b3075a05c147470c7bb (patch)
tree: 87e7a93e825fca250835533d536b597323659436
parent: 4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699 (diff)
download: fuzzycat-1f91606501754bf8d3fa8b3075a05c147470c7bb.tar.gz
fuzzycat-1f91606501754bf8d3fa8b3075a05c147470c7bb.zip
3 files changed, 181 insertions, 17 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index a65eb63..3a3b3ba 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -23,6 +23,7 @@ import tempfile
 from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
                               release_key_title_normalized, release_key_title_nysiis,
                               release_key_title_sandcrawler)
+from fuzzycat.verify import GroupVerifier
 
 
 def run_cluster(args):
@@ -52,7 +53,8 @@ def run_verify(args):
     TODO. Ok, we should not fetch data we have on disk (at the clustering
     step).
     """
-    pass
+    gv = GroupVerifier(iterable=fileinput.input(files=args.files))
+    gv.run()
 
 
 if __name__ == '__main__':
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 87b010e..8eb409c 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -106,7 +106,6 @@ class KeyDoc:
     """
     ident: str
     title: str
-    contribs: List[Contrib] = field(default_factory=list)
 
 
 @dataclass
@@ -430,7 +429,8 @@ class Cluster:
                  key: Callable[[Any], str] = None) -> Generator[Any, None, None]:
         """
         Extract a key from elements of an iterable and group them. Just as
-        uniq(1), the iterable must be ordered for this to work.
+        uniq(1), the iterable must be ordered (by the key that is extracted)
+        for this to work.
         """
         for k, g in itertools.groupby(seq, key=key):
             items = list(g)
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 841df49..55b8ef6 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -1,27 +1,189 @@
 """
 Verification part of matching.
 
-We represent clusters as json lines. One example input line (prettified):
+We represent clusters as json lines. One example input line:
 
     {
       "v": [
-        "cjcpmod6pjaczbhrqfljdfl4m4",
-        "di5kdt5apfc6fiiqofjzkuiqey",
-        "fxhwvmc7dzc6bpuvo7ds4l5gx4",
-        "pda5cuevyrcmpgj3woxw7ktvz4",
-        "port5bx5nzb7tghqsjknnhs56y",
-        "x3a43yczavdkfhp3ekgt5hn6l4"
+        {...}, ...
       ],
       "k": "1 Grundlagen",
-      "c": "t"
     }
 
-Further steps:
+Examples from clustering stage (from a sample of 100k records):
 
-* fetch all releases, this might be via API, search index, some local key value
-store, or some other cache
-* apply various rules, return match status
-* alternatively: have a few more fields in the intermediate representation (to
-keep operation local)
+    ["Global residue formula for logarithmic indices of foliations",2]
+    ["Glossary",8]
+    ["Gordonia sp.",4]
+    ["ERRATA",6]
+    ["ERRATUM",4]
+    ["Editor's Note",8]
+    ["Editorial",95]
+    ["Editorial Board",154]
+    ["Editorial Board & Publication Information",2]
+    ...
 
 """
+
+import collections
+import itertools
+import json
+import operator
+
+get_key_values = operator.itemgetter("k", "v")
+
+# These titles appear too often, so ignore them for now.
+TITLE_BLACKLIST = set([
+    "",
+    "abstracts",
+    "acknowledgements",
+    "acknowledgments",
+    "announcement",
+    "announcements",
+    "arthrobacter sp.",
+    "author index",
+    "back matter",
+    "backmatter",
+    "bibliography",
+    "book review",
+    "book reviews",
+    "books received",
+    "calendar",
+    "conclusion",
+    "conclusions",
+    "contents",
+    "contributors",
+    "copyright",
+    "correction",
+    "correspondence",
+    "corrigendum",
+    "cover",
+    "dedication",
+    "discussion",
+    "editorial",
+    "editorial board",
+    "einleitung",
+    "erratum",
+    "foreword",
+    "front cover",
+    "front matter",
+    "frontmatter",
+    "gbif occurrence download",
+    "index",
+    "inhalt",
+    "in this issue",
+    "introduction",
+    "issue information",
+    "letters to the editor",
+    "letter to the editor",
+    "masthead",
+    "miscellany",
+    "news",
+    "not available",
+    "notes",
+    "occurrence download",
+    "[others]",
+    "oup accepted manuscript",
+    "petitions.xlsx",
+    "preface",
+    "preliminary material",
+    "preservation image",
+    "references",
+    "reply",
+    "reviews",
+    "reviews of books",
+    "short notices",
+    "[s.n.]",
+    "streptomyces sp.",
+    "subject index",
+    "table of contents",
+    "taxonomic abstract for the species.",
+    "the applause data release 2",
+    ":{unav)",
+    "奥付",
+    "投稿規定",
+    "目次",
+    "表紙",
+    "裏表紙",
+])
+
+
+class GroupVerifier:
+    """
+    Verifier.
+
+    Within a group, we could have multiple sub clusters, e.g.
+
+    > [AABAB]
+
+    We would need to compare each possible pair and decide whether they are the
+    same.
+    """
+    def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10):
+        self.iterable: collections.abc.Iterable = iterable
+        self.max_cluster_size: int = 10
+        self.counter = collections.Counter({
+            "unique": 0,
+            "too_large": 0,
+        })
+
+    def run(self):
+        for i, line in enumerate(self.iterable):
+            if i % 20000 == 0:
+                print(i)
+            line = line.strip()
+            if not line:
+                continue
+            doc = json.loads(line)
+            k, vs = get_key_values(doc)
+            if len(vs) < 2:
+                self.counter["unique"] += 1
+                continue
+            if len(vs) > self.max_cluster_size:
+                self.counter["too_large"] += 1
+                continue
+            for a, b in itertools.combinations(vs, r=2):
+                result = self.compare(a, b)
+                # print(a.get("ident"), b.get("ident"), result)
+                # print(a.get("title")[:30], " ---- ", b.get("title")[:20])
+
+        print(json.dumps(dict(self.counter)))
+
+
+    def compare(self, a, b):
+        """
+        We compare two release entities here.
+
+        * ext_ids.doi
+        * contribs
+        * is the title meaningful enough, is it too common, too short
+        * files share a sha1
+        * arxiv versions
+        """
+        if len(a.get("title")) < 5:
+            self.counter["short_title"] += 1
+            return False
+        if a.get("title", "").lower() in TITLE_BLACKLIST:
+            self.counter["blacklist"] += 1
+            return False
+
+        arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
+        arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
+        if arxiv_id_a and arxiv_id_b:
+            id_a, version_a = arxiv_id_a.split("v")
+            id_b, version_b = arxiv_id_b.split("v")
+            if id_a == id_b:
+                self.counter["arxiv_v"] += 1
+                return True
+            else:
+                return False
+
+        a_authors = set([v.get("raw_name") for v in a.get("contribs", [])])
+        b_authors = set([v.get("raw_name") for v in b.get("contribs", [])])
+
+        if len(a_authors & b_authors) == 0:
+            self.counter["contrib_miss"] += 1
+            return False
+
+        self.counter["dummy"] += 1
+        return True
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-13 02:14:26 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-13 02:14:26 +0100
commit	1f91606501754bf8d3fa8b3075a05c147470c7bb (patch)
tree	87e7a93e825fca250835533d536b597323659436
parent	4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699 (diff)
download	fuzzycat-1f91606501754bf8d3fa8b3075a05c147470c7bb.tar.gz fuzzycat-1f91606501754bf8d3fa8b3075a05c147470c7bb.zip