# python/scripts/grobid_affiliations.py

#!/usr/bin/env python3
"""
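Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, extracts the raw author affiliations, and
prints the unique ones for each document as a JSON list, one tab-separated
"sha1hex<TAB>JSON" line per document that has any affiliations.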

Run in bulk like:

    ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""

import json
import sys

from grobid_tei_xml import parse_document_xml


def parse_hbase(line):
    """
    Decode one old-style (HBase dump) line.

    The line must consist of exactly two tab-separated columns: the SHA-1 hex
    key followed by a JSON object that carries a "tei_xml" entry.

    Returns a ``(sha1hex, tei_xml)`` tuple.
    """
    columns = line.split("\t")
    assert len(columns) == 2
    key, payload = columns
    return key, json.loads(payload)["tei_xml"]


def parse_pg(line):
    """
    Decode one new-style (pg dump) line.

    The whole line is a single JSON object holding both the "sha1hex" and
    the "tei_xml" entries.

    Returns a ``(sha1hex, tei_xml)`` tuple.
    """
    record = json.loads(line)
    sha1hex = record["sha1hex"]
    tei_xml = record["tei_xml"]
    return sha1hex, tei_xml


def run(mode="hbase"):
    """
    Read wrapped GROBID TEI-XML records from stdin and print their affiliations.

    Each stdin line is decoded with ``parse_hbase`` when ``mode`` is "hbase"
    or with ``parse_pg`` when it is "pg". The TEI-XML is parsed, converted to
    the legacy dict form, and the non-empty "affiliation" value of every
    author is collected.

    For each record with at least one affiliation, one line is printed to
    stdout: the SHA-1 hex, a tab, and a JSON list of the distinct
    affiliations in the order they first appear among the authors. Records
    without affiliations produce no output.

    Raises:
        NotImplementedError: if ``mode`` is neither "hbase" nor "pg" (raised
            when the first input line is processed).
    """
    for line in sys.stdin:
        if mode == "hbase":
            sha1hex, tei_xml = parse_hbase(line)
        elif mode == "pg":
            sha1hex, tei_xml = parse_pg(line)
        else:
            raise NotImplementedError("parse mode: {}".format(mode))

        tei_doc = parse_document_xml(tei_xml)
        tei_doc.remove_encumbered()
        obj = tei_doc.to_legacy_dict()

        # Deduplicate on the serialized form rather than on the values
        # themselves. An insertion-ordered dict (instead of a set of strings)
        # keeps the output order stable across runs; sort_keys makes the
        # dedup key independent of dict key order.
        unique = {}
        for author in obj["authors"]:
            affiliation = author.get("affiliation")
            if affiliation:
                unique.setdefault(json.dumps(affiliation, sort_keys=True), affiliation)
        if unique:
            print("\t".join([sha1hex, json.dumps(list(unique.values()))]))


# Script entry point: process stdin using run()'s default mode ("hbase").
if __name__ == "__main__":
    run()