path: root/python/scripts/import_grobid_metadata.py
#!/usr/bin/env python3
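"""
Reads GROBID metadata JSON objects, one per line, from stdin and converts each
into a release entity dict (title, contribs, abstracts, refs, extra, etc.),
printing one result per line. Records without a title are skipped.

Example invocation (file name is illustrative):

    cat grobid_metadata.jsonl | ./import_grobid_metadata.py
"""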

import datetime
import json
import sys

MAX_ABSTRACT_BYTES = 4096


def parse_grobid_json(obj):
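    """Converts a single GROBID metadata object into a release dict.

    Keys read from `obj` (all optional except "title"): "title", "abstract",
    "authors" (a list of raw name strings), "citations", "date", "doi", and
    "journal" (a dict with "name", "publisher", "volume", "issue").

    Returns None for records with no title.
    """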

    if not obj.get("title"):
        return None

    extra = dict()

    if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
        abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
        abstracts = [abobj]
    else:
        abstracts = None

    contribs = []
    for a in obj.get("authors", []):
        c = dict(raw_name=a, role="author")
        contribs.append(c)

    refs = []
    for raw in obj.get("citations", []):
        # use a separate dict for citation-level extra so the release-level
        # `extra` dict above is not clobbered
        ref_extra = dict()
        ref = dict()
        ref["key"] = raw.get("id")
        if raw.get("title"):
            ref["title"] = raw["title"].strip()
        if raw.get("date"):
            try:
                year = int(raw["date"].strip()[:4])
                ref["year"] = year
            except (ValueError, TypeError):
                pass
        for key in ("volume", "url", "issue", "publisher"):
            if raw.get(key):
                ref_extra[key] = raw[key].strip()
        if raw.get("authors"):
            ref_extra["authors"] = [a["name"] for a in raw["authors"]]
        if ref_extra:
            ref["extra"] = dict(grobid=ref_extra)
        else:
            ref["extra"] = None
        refs.append(ref)

    release_type = "journal-article"
    release_date = None
    if obj.get("date"):
        # TODO: GROBID seems to only return a year; how should that be handled?
        try:
            release_date = datetime.datetime(year=int(obj["date"].strip()[:4]), month=1, day=1)
        except (ValueError, TypeError):
            pass

    # journal metadata may be missing entirely
    journal = obj.get("journal") or dict()

    if obj.get("doi"):
        extra["doi"] = obj["doi"].lower()
    if journal.get("name"):
        extra["container_name"] = journal["name"]

    extra["is_longtail_oa"] = True

    # TODO: ISSN/eISSN handling? or just journal name lookup?

    if extra:
        extra = dict(grobid=extra)
    else:
        extra = None

    return dict(
        title=obj["title"].strip(),
        contribs=contribs,
        refs=refs,
        publisher=journal.get("publisher"),
        volume=journal.get("volume"),
        issue=journal.get("issue"),
        abstracts=abstracts,
        release_type=release_type,
        release_date=release_date,
        extra=extra,
    )


def run():
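    """Reads JSON objects from stdin, one per line, and prints the parsed
    release dict for every record that has a title."""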
    for line in sys.stdin:
        obj = json.loads(line)
        out = parse_grobid_json(obj)
        if out:
            # emit JSON rather than a Python dict repr; the datetime is stringified
            print(json.dumps(out, default=str))


if __name__ == "__main__":
    run()