#!/usr/bin/env python3
"""
Filters an input stream of sorted output from the "matchcrossref" scalding job,
and outputs "good enough" matches to be inserted into fatcat.

Currently works on DOI numbers. Filters for a high enough match score (doesn't
re-do the title match itself), checks author lists, and filters out slugs with
too many candidate matches. Outputs one line per sha1 (aka, per file).

No dependencies (only python3 stdlib)
"""

import sys
import json

# minimum match score to keep (scores are out of 1000)
score_threshold = 900

# skip slugs with more than this many candidate match lines
max_slug_lines = 10

# minimum number of authors (not referenced in this script)
require_authors = 1


def tokenize(s, remove_whitespace=False):
    """Normalize a string for comparison: lowercase, strip punctuation, encode to ASCII."""
    # undo any HTML-escaped apostrophes before stripping punctuation
    s = s.replace('&apos;', "'")
    # Remove non-alphanumeric characters
    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])

    if remove_whitespace:
        s = ''.join(s.split())

    # Encode as dumb ASCII (TODO: this is horrible)
    return s.encode('ascii', 'replace').replace(b'?', b'')

def check_authors(left, right):
    """
    Intended to check GROBID extracted authors (right) against "known good"
    (but maybe not perfect) Crossref metadata authors ("left").
    """
    if not left:
        return False
    if len(left) > len(right):
        return False
    right_all = tokenize(" ".join(right))
    for i in range(len(left)):
        l = left[i].lower().replace('jr.', '').split()
        if not l:
            return False
        l = tokenize(l[-1])
        if len(l) <= 1:
            # weird author name (single char)
            return False
        if l not in right_all:
            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
            return False
    return True

def test_check_authors():
    assert not check_authors([], [])
    assert not check_authors([], ['one'])
    assert check_authors(['one'], ['one'])
    assert check_authors(['one two'], ['One Two'])
    assert check_authors(['two'], ['One Two'])
    assert check_authors(['two'], ['two, one'])
    assert check_authors(['mago'], ['Mr. Magoo'])
    assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])

# Rows are (score, grobid, crossref)
def process_group(rows):
    if len(rows) > max_slug_lines:
        return
    keepers = dict()
    for row in rows:
        score = int(row[0])
        if score < score_threshold:
            continue
        grobid = json.loads(row[1])
        crossref = json.loads(row[2])
        if not check_authors(crossref['authors'], grobid['authors']):
            #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
            continue
        else:
            #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
            pass
        sha1 = grobid['sha1']
        doi = crossref['doi'].lower()
        l = keepers.get(sha1, list())
        l.append(doi)
        keepers[sha1] = l
    for sha1, doi_list in keepers.items():
        print("{}\t{}".format(sha1, json.dumps(doi_list)))

def run():

    last_slug = None
    lines = []

    # group lines by slug, and process in batches
    for line in sys.stdin:
        line = line.strip().split('\t')
        assert len(line) == 4
        slug = line[0]
        if last_slug and slug != last_slug and lines:
            process_group(lines)
            lines = []
        last_slug = slug
        lines.append(line[1:])

    # catch any remaining
    if lines:
        process_group(lines)

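# Example invocation (filenames are hypothetical); input must already be sorted
# by slug so that candidate matches for the same title group together:
#
#   cat scored_matches.tsv | ./filter_scored_matches.py > filtered_matches.tsv
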
if __name__ == '__main__':
    run()