#!/usr/bin/env python3
"""
Filters an input stream of the sorted "matchcrossref" scalding job, and
outputs "good enough" matches to be inserted into fatcat.

Currently works on DOI numbers.

Filters for a high enough string match score (doesn't re-do title matching),
and checks author lists. Filters out slugs with too many matches, and outputs
one line per sha1 (i.e., per file).

No dependencies (python3 stdlib only)
"""

import json
import sys

# scores are out of 1000
score_threshold = 900

# skip slugs with more candidate matches than this
max_slug_lines = 10

# (currently unused)
require_authors = 1


def tokenize(s, remove_whitespace=False):
    # Normalize unicode apostrophes to ASCII
    s = s.replace("’", "'")

    # Remove non-alphanumeric characters
    s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])

    if remove_whitespace:
        s = "".join(s.split())

    # Encode as dumb ASCII (TODO: this is horrible)
    return s.encode("ascii", "replace").replace(b"?", b"")


def check_authors(left, right):
    """
    Intended to check GROBID-extracted authors (right) against "known good"
    (but maybe not perfect) Crossref metadata authors (left).

    Every last name on the left must appear (as a substring) in the tokenized
    right-hand author list.
    """
    if not left:
        return False
    if len(left) > len(right):
        return False
    right_all = tokenize(" ".join(right))
    for i in range(len(left)):
        l = left[i].lower().replace("jr.", "").split()
        if not l:
            return False
        # compare on the last name token only
        l = tokenize(l[-1])
        if len(l) <= 1:
            # weird author name (single char)
            return False
        if l not in right_all:
            # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
            return False
    return True


def test_check_authors():
    assert not check_authors([], [])
    assert not check_authors([], ["one"])
    assert check_authors(["one"], ["one"])
    assert check_authors(["one two"], ["One Two"])
    assert check_authors(["two"], ["One Two"])
    assert check_authors(["two"], ["two, one"])
    assert check_authors(["mago"], ["Mr. Magoo"])
    assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
    assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])


# Rows are (score, grobid, crossref)
def process_group(rows):
    if len(rows) > max_slug_lines:
        return
    keepers = dict()
    for row in rows:
        score = int(row[0])
        if score < score_threshold:
            continue
        grobid = json.loads(row[1])
        crossref = json.loads(row[2])
        if not check_authors(crossref["authors"], grobid["authors"]):
            # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
            continue
        # print("YES: {} {}".format(crossref['authors'], grobid['authors']))
        sha1 = grobid["sha1"]
        doi = crossref["doi"].lower()
        l = keepers.get(sha1, list())
        l.append(doi)
        keepers[sha1] = l
    for sha1, doi_list in keepers.items():
        print("{}\t{}".format(sha1, json.dumps(doi_list)))


def run():
    last_slug = None
    lines = []

    # group lines by slug, and process in batches
    for line in sys.stdin:
        line = line.strip().split("\t")
        assert len(line) == 4
        slug = line[0]
        if last_slug and slug != last_slug and lines:
            process_group(lines)
            lines = []
        last_slug = slug
        lines.append(line[1:])

    # catch any remaining
    if lines:
        process_group(lines)


if __name__ == "__main__":
    run()
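
# Illustrative test of tokenize() behavior (a sketch, not part of the original
# script; expected byte-strings follow from the implementation above, including
# the lossy ASCII step that simply drops non-ASCII characters).
def test_tokenize():
    assert tokenize("Impact of Copper.") == b"impact of copper"
    assert tokenize("Impact of Copper.", remove_whitespace=True) == b"impactofcopper"
    # unicode apostrophe is normalized, then stripped as non-alphanumeric
    assert tokenize("O’Connor") == b"oconnor"
    # non-ASCII characters are dropped outright ("dumb ASCII")
    assert tokenize("naïve") == b"nave"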