author    Martin Czygan <martin.czygan@gmail.com>    2020-11-13 02:14:26 +0100
committer Martin Czygan <martin.czygan@gmail.com>    2020-11-13 02:14:26 +0100
commit    1f91606501754bf8d3fa8b3075a05c147470c7bb (patch)
tree      87e7a93e825fca250835533d536b597323659436
parent    4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699 (diff)
wip: verification
Output currently (1m sample):

    { "unique": 916075, "too_large": 575, "dummy": 10307, "contrib_miss": 27215, "short_title": 1379, "arxiv_v": 8943 }
-rw-r--r--    fuzzycat/__main__.py    4
-rw-r--r--    fuzzycat/cluster.py     4
-rw-r--r--    fuzzycat/verify.py      190
3 files changed, 181 insertions, 17 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index a65eb63..3a3b3ba 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -23,6 +23,7 @@ import tempfile
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
release_key_title_normalized, release_key_title_nysiis,
release_key_title_sandcrawler)
+from fuzzycat.verify import GroupVerifier
def run_cluster(args):
@@ -52,7 +53,8 @@ def run_verify(args):
TODO: We should not fetch data that we already have on disk (from the
clustering step).
"""
- pass
+ gv = GroupVerifier(iterable=fileinput.input(files=args.files))
+ gv.run()
if __name__ == '__main__':
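
The verify subcommand streams cluster documents straight into the verifier.
A minimal sketch of driving it programmatically (the filename
clusters.ndjson is illustrative; the file holds one cluster JSON object per
line):

    import fileinput

    from fuzzycat.verify import GroupVerifier

    gv = GroupVerifier(iterable=fileinput.input(files=["clusters.ndjson"]))
    gv.run()  # prints progress and a final JSON dict of counters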
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 87b010e..8eb409c 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -106,7 +106,6 @@ class KeyDoc:
"""
ident: str
title: str
- contribs: List[Contrib] = field(default_factory=list)
@dataclass
@@ -430,7 +429,8 @@ class Cluster:
key: Callable[[Any], str] = None) -> Generator[Any, None, None]:
"""
Extract a key from elements of an iterable and group them. Just as
- uniq(1), the iterable must be ordered for this to work.
+ uniq(1), the iterable must be ordered (by the key that is extracted)
+ for this to work.
"""
for k, g in itertools.groupby(seq, key=key):
items = list(g)
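
The sharpened docstring matters in practice: itertools.groupby only merges
adjacent elements, so input that is not sorted by the extracted key silently
yields fragmented groups. A small illustration with made-up documents:

    import itertools
    import operator

    docs = [{"k": "a"}, {"k": "b"}, {"k": "a"}]
    key = operator.itemgetter("k")

    # Unsorted input: key "a" surfaces as two separate groups, as with uniq(1).
    print([k for k, _ in itertools.groupby(docs, key=key)])  # ['a', 'b', 'a']

    # Sorted by the extracted key first: one group per key, as intended.
    print([k for k, _ in itertools.groupby(sorted(docs, key=key), key=key)])  # ['a', 'b']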
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 841df49..55b8ef6 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -1,27 +1,189 @@
"""
Verification part of matching.
-We represent clusters as json lines. One example input line (prettified):
+We represent clusters as json lines. One example input line:
{
"v": [
- "cjcpmod6pjaczbhrqfljdfl4m4",
- "di5kdt5apfc6fiiqofjzkuiqey",
- "fxhwvmc7dzc6bpuvo7ds4l5gx4",
- "pda5cuevyrcmpgj3woxw7ktvz4",
- "port5bx5nzb7tghqsjknnhs56y",
- "x3a43yczavdkfhp3ekgt5hn6l4"
+ {...}, ...
],
"k": "1 Grundlagen",
- "c": "t"
}
-Further steps:
+Examples from the clustering stage (cluster key and size, from a sample of 100k records):
-* fetch all releases, this might be via API, search index, some local key value
-store, or some other cache
-* apply various rules, return match status
-* alternatively: have a few more fields in the intermediate representation (to
-keep operation local)
+ ["Global residue formula for logarithmic indices of foliations",2]
+ ["Glossary",8]
+ ["Gordonia sp.",4]
+ ["ERRATA",6]
+ ["ERRATUM",4]
+ ["Editor's Note",8]
+ ["Editorial",95]
+ ["Editorial Board",154]
+ ["Editorial Board & Publication Information",2]
+ ...
"""
+
+import collections
+import collections.abc
+import itertools
+import json
+import operator
+import re
+
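+# Accessor for the two fields of a cluster document: "k" (the cluster key)
+# and "v" (the list of member release entities).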
+get_key_values = operator.itemgetter("k", "v")
+
+# These titles appear too often, so ignore them for now.
+TITLE_BLACKLIST = set([
+ "",
+ "abstracts",
+ "acknowledgements",
+ "acknowledgments",
+ "announcement",
+ "announcements",
+ "arthrobacter sp.",
+ "author index",
+ "back matter",
+ "backmatter",
+ "bibliography",
+ "book review",
+ "book reviews",
+ "books received",
+ "calendar",
+ "conclusion",
+ "conclusions",
+ "contents",
+ "contributors",
+ "copyright",
+ "correction",
+ "correspondence",
+ "corrigendum",
+ "cover",
+ "dedication",
+ "discussion",
+ "editorial",
+ "editorial board",
+ "einleitung",
+ "erratum",
+ "foreword",
+ "front cover",
+ "front matter",
+ "frontmatter",
+ "gbif occurrence download",
+ "index",
+ "inhalt",
+ "in this issue",
+ "introduction",
+ "issue information",
+ "letters to the editor",
+ "letter to the editor",
+ "masthead",
+ "miscellany",
+ "news",
+ "not available",
+ "notes",
+ "occurrence download",
+ "[others]",
+ "oup accepted manuscript",
+ "petitions.xlsx",
+ "preface",
+ "preliminary material",
+ "preservation image",
+ "references",
+ "reply",
+ "reviews",
+ "reviews of books",
+ "short notices",
+ "[s.n.]",
+ "streptomyces sp.",
+ "subject index",
+ "table of contents",
+ "taxonomic abstract for the species.",
+ "the applause data release 2",
+ ":{unav)",
+ "奥付",
+ "投稿規定",
+ "目次",
+ "表紙",
+ "裏表紙",
+])
+
+
+class GroupVerifier:
+ """
+ Verifier.
+
+    Within a group, we could have multiple sub-clusters, e.g.
+
+    > [AABAB]
+
+    We need to compare each possible pair and decide whether the two
+    records are the same.
+ """
+    def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10):
+        self.iterable: collections.abc.Iterable = iterable
+        self.max_cluster_size: int = max_cluster_size
+ self.counter = collections.Counter({
+ "unique": 0,
+ "too_large": 0,
+ })
+
+ def run(self):
+ for i, line in enumerate(self.iterable):
+ if i % 20000 == 0:
+ print(i)
+ line = line.strip()
+ if not line:
+ continue
+ doc = json.loads(line)
+ k, vs = get_key_values(doc)
+ if len(vs) < 2:
+ self.counter["unique"] += 1
+ continue
+ if len(vs) > self.max_cluster_size:
+ self.counter["too_large"] += 1
+ continue
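+            # Compare every unordered pair within the cluster; compare()
+            # updates the counters as a side effect.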
+ for a, b in itertools.combinations(vs, r=2):
+ result = self.compare(a, b)
+ # print(a.get("ident"), b.get("ident"), result)
+ # print(a.get("title")[:30], " ---- ", b.get("title")[:20])
+
+ print(json.dumps(dict(self.counter)))
+
+ def compare(self, a, b):
+ """
+ We compare two release entities here.
+
+ * ext_ids.doi
+ * contribs
+ * is the title meaningful enough, is it too common, too short
+ * files share a sha1
+ * arxiv versions
+ """
+        # Only the first record's title is inspected here; cluster keys are
+        # title-derived, so members carry similar titles. A missing title
+        # counts as too short.
+        if len(a.get("title", "")) < 5:
+            self.counter["short_title"] += 1
+            return False
+        if a.get("title", "").lower() in TITLE_BLACKLIST:
+            self.counter["blacklist"] += 1
+            return False
+
+        arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
+        arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
+        if arxiv_id_a and arxiv_id_b:
+            # Strip a trailing version suffix, e.g. "1708.01234v2" becomes
+            # "1708.01234"; unlike a bare split("v"), this does not raise on
+            # ids without a version marker.
+            id_a = re.sub(r"v\d+$", "", arxiv_id_a)
+            id_b = re.sub(r"v\d+$", "", arxiv_id_b)
+            if id_a == id_b:
+                self.counter["arxiv_v"] += 1
+                return True
+            else:
+                return False
+
+        # Ignore contribs without a raw_name, so that records with only
+        # empty contrib entries do not match on None.
+        a_authors = set(v.get("raw_name") for v in a.get("contribs", []) if v.get("raw_name"))
+        b_authors = set(v.get("raw_name") for v in b.get("contribs", []) if v.get("raw_name"))
+
+ if len(a_authors & b_authors) == 0:
+ self.counter["contrib_miss"] += 1
+ return False
+
+ self.counter["dummy"] += 1
+ return True
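
For a quick check of the pairwise logic, a sketch with hypothetical release
entities (all field values invented for illustration):

    from fuzzycat.verify import GroupVerifier

    gv = GroupVerifier(iterable=[])
    a = {"title": "Deep learning", "contribs": [{"raw_name": "J. Doe"}], "ext_ids": {}}
    b = {"title": "Deep learning", "contribs": [{"raw_name": "J. Doe"}], "ext_ids": {}}
    print(gv.compare(a, b))   # True, via the shared contributor name
    print(dict(gv.counter))   # {'unique': 0, 'too_large': 0, 'dummy': 1}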