diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-13 02:14:26 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-13 02:14:26 +0100 |
commit | 1f91606501754bf8d3fa8b3075a05c147470c7bb (patch) | |
tree | 87e7a93e825fca250835533d536b597323659436 | |
parent | 4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699 (diff) | |
download | fuzzycat-1f91606501754bf8d3fa8b3075a05c147470c7bb.tar.gz fuzzycat-1f91606501754bf8d3fa8b3075a05c147470c7bb.zip |
wip: verification
Current output (1M sample):
{
"unique": 916075,
"too_large": 575,
"dummy": 10307,
"contrib_miss": 27215,
"short_title": 1379,
"arxiv_v": 8943
}
-rw-r--r-- | fuzzycat/__main__.py | 4 | ||||
-rw-r--r-- | fuzzycat/cluster.py | 4 | ||||
-rw-r--r-- | fuzzycat/verify.py | 190 |
3 files changed, 181 insertions, 17 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py index a65eb63..3a3b3ba 100644 --- a/fuzzycat/__main__.py +++ b/fuzzycat/__main__.py @@ -23,6 +23,7 @@ import tempfile from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram, release_key_title_normalized, release_key_title_nysiis, release_key_title_sandcrawler) +from fuzzycat.verify import GroupVerifier def run_cluster(args): @@ -52,7 +53,8 @@ def run_verify(args): TODO. Ok, we should not fetch data we have on disk (at the clustering step). """ - pass + gv = GroupVerifier(iterable=fileinput.input(files=args.files)) + gv.run() if __name__ == '__main__': diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 87b010e..8eb409c 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -106,7 +106,6 @@ class KeyDoc: """ ident: str title: str - contribs: List[Contrib] = field(default_factory=list) @dataclass @@ -430,7 +429,8 @@ class Cluster: key: Callable[[Any], str] = None) -> Generator[Any, None, None]: """ Extract a key from elements of an iterable and group them. Just as - uniq(1), the iterable must be ordered for this to work. + uniq(1), the iterable must be ordered (by the key that is extracted) + for this to work. """ for k, g in itertools.groupby(seq, key=key): items = list(g) diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 841df49..55b8ef6 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -1,27 +1,189 @@ """ Verification part of matching. -We represent clusters as json lines. One example input line (prettified): +We represent clusters as json lines. One example input line: { "v": [ - "cjcpmod6pjaczbhrqfljdfl4m4", - "di5kdt5apfc6fiiqofjzkuiqey", - "fxhwvmc7dzc6bpuvo7ds4l5gx4", - "pda5cuevyrcmpgj3woxw7ktvz4", - "port5bx5nzb7tghqsjknnhs56y", - "x3a43yczavdkfhp3ekgt5hn6l4" + {...}, ... 
], "k": "1 Grundlagen", - "c": "t" } -Further steps: +Examples from clustering stage (from a sample of 100k records): -* fetch all releases, this might be via API, search index, some local key value -store, or some other cache -* apply various rules, return match status -* alternatively: have a few more fields in the intermediate representation (to -keep operation local) + ["Global residue formula for logarithmic indices of foliations",2] + ["Glossary",8] + ["Gordonia sp.",4] + ["ERRATA",6] + ["ERRATUM",4] + ["Editor's Note",8] + ["Editorial",95] + ["Editorial Board",154] + ["Editorial Board & Publication Information",2] + ... """ + +import collections +import itertools +import json +import operator + +get_key_values = operator.itemgetter("k", "v") + +# There titles appear too often, so ignore them for now. +TITLE_BLACKLIST = set([ + "", + "abstracts", + "acknowledgements", + "acknowledgments", + "announcement", + "announcements", + "arthrobacter sp.", + "author index", + "back matter", + "backmatter", + "bibliography", + "book review", + "book reviews", + "books received", + "calendar", + "conclusion", + "conclusions", + "contents", + "contributors", + "copyright", + "correction", + "correspondence", + "corrigendum", + "cover", + "dedication", + "discussion", + "editorial", + "editorial board", + "einleitung", + "erratum", + "foreword", + "front cover", + "front matter", + "frontmatter", + "gbif occurrence download", + "index", + "inhalt", + "in this issue", + "introduction", + "issue information", + "letters to the editor", + "letter to the editor", + "masthead", + "miscellany", + "news", + "not available", + "notes", + "occurrence download", + "[others]", + "oup accepted manuscript", + "petitions.xlsx", + "preface", + "preliminary material", + "preservation image", + "references", + "reply", + "reviews", + "reviews of books", + "short notices", + "[s.n.]", + "streptomyces sp.", + "subject index", + "table of contents", + "taxonomic abstract for the species.", + 
"the applause data release 2", + ":{unav)", + "奥付", + "投稿規定", + "目次", + "表紙", + "裏表紙", +]) + + +class GroupVerifier: + """ + Verifier. + + Within a group, we could have multiple sub clusters, e.g. + + > [AABAB] + + We would need to compare each possible pair and decide whether they are the + same. + """ + def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10): + self.iterable: collections.abc.Iterable = iterable + self.max_cluster_size: int = 10 + self.counter = collections.Counter({ + "unique": 0, + "too_large": 0, + }) + + def run(self): + for i, line in enumerate(self.iterable): + if i % 20000 == 0: + print(i) + line = line.strip() + if not line: + continue + doc = json.loads(line) + k, vs = get_key_values(doc) + if len(vs) < 2: + self.counter["unique"] += 1 + continue + if len(vs) > self.max_cluster_size: + self.counter["too_large"] += 1 + continue + for a, b in itertools.combinations(vs, r=2): + result = self.compare(a, b) + # print(a.get("ident"), b.get("ident"), result) + # print(a.get("title")[:30], " ---- ", b.get("title")[:20]) + + print(json.dumps(dict(self.counter))) + + + def compare(self, a, b): + """ + We compare two release entities here. 
+ + * ext_ids.doi + * contribs + * is the title meaningful enough, is it too common, too short + * files share a sha1 + * arxiv versions + """ + if len(a.get("title")) < 5: + self.counter["short_title"] += 1 + return False + if a.get("title", "").lower() in TITLE_BLACKLIST: + self.counter["blacklist"] += 1 + return False + + arxiv_id_a = a.get("ext_ids", {}).get("arxiv") + arxiv_id_b = b.get("ext_ids", {}).get("arxiv") + if arxiv_id_a and arxiv_id_b: + id_a, version_a = arxiv_id_a.split("v") + id_b, version_b = arxiv_id_b.split("v") + if id_a == id_b: + self.counter["arxiv_v"] += 1 + return True + else: + return False + + a_authors = set([v.get("raw_name") for v in a.get("contribs", [])]) + b_authors = set([v.get("raw_name") for v in b.get("contribs", [])]) + + if len(a_authors & b_authors) == 0: + self.counter["contrib_miss"] += 1 + return False + + self.counter["dummy"] += 1 + return True |