1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
#!/usr/bin/env python3
import datetime
import json
import sys
# Abstracts at or above this length are dropped instead of being stored.
# NOTE(review): despite the name, the check below uses len() on the abstract,
# which counts characters when the abstract is a str -- confirm the intended unit.
MAX_ABSTRACT_BYTES = 4096


def _parse_release_date(value):
    """Return ``datetime.datetime(year, 1, 1)`` for *value*, or ``None``.

    *value* may be an integer year or a string starting with a four-digit
    year (the same convention the citation dates use). Anything that cannot
    be turned into a valid year yields ``None`` instead of raising.
    """
    if isinstance(value, str):
        value = value.strip()[:4]
    try:
        return datetime.datetime(year=int(value), month=1, day=1)
    except (TypeError, ValueError, OverflowError):
        return None


def _parse_citation(raw):
    """Convert one entry of ``obj["citations"]`` into a reference dict.

    The result always has ``key`` and ``extra``; ``title`` and ``year`` are
    only present when they could be extracted from *raw*.
    """
    ref = dict(key=raw.get("id"))
    if raw.get("title"):
        ref["title"] = raw["title"].strip()
    if raw.get("date"):
        # Best effort: a date that is not a string or does not start with
        # a number simply leaves the reference without a year.
        try:
            ref["year"] = int(raw["date"].strip()[:4])
        except (AttributeError, TypeError, ValueError):
            pass

    # Kept in a dict of its own so it can never clobber the release-level
    # "extra" built by parse_grobid_json().
    ref_extra = {
        key: raw[key].strip()
        for key in ("volume", "url", "issue", "publisher")
        if raw.get(key)
    }
    if raw.get("authors"):
        ref_extra["authors"] = [a["name"] for a in raw["authors"]]
    ref["extra"] = dict(grobid=ref_extra) if ref_extra else None
    return ref


def parse_grobid_json(obj):
    """Build a release-like dict from one parsed GROBID JSON document.

    Reads the keys ``title``, ``abstract``, ``authors``, ``citations``,
    ``date``, ``doi`` and ``journal`` from *obj*; all but ``title`` are
    optional, and a missing or null ``journal``/``authors``/``citations``
    is treated as empty.

    Returns ``None`` when *obj* has no non-empty title. Otherwise returns a
    dict with the keys ``title``, ``contribs``, ``publisher``, ``volume``,
    ``issue``, ``abstracts``, ``release_type``, ``release_date`` and
    ``extra``. ``release_date`` is a ``datetime.datetime`` for January 1st
    of the document's year, or ``None`` when no year can be read.
    """
    if not obj.get("title"):
        return None

    abstract = obj.get("abstract")
    if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
        abstracts = [
            dict(mimetype="text/plain", language=None, content=abstract.strip())
        ]
    else:
        abstracts = None

    contribs = [
        dict(raw_name=name, role="author") for name in obj.get("authors") or []
    ]

    # NOTE(review): the references are parsed but are not part of the
    # returned dict, exactly as before -- confirm whether they should be
    # exposed (e.g. as ``refs=refs``).
    refs = [_parse_citation(raw) for raw in obj.get("citations") or []]

    # TODO: only returns year, ever? how to handle?
    release_date = _parse_release_date(obj["date"]) if obj.get("date") else None

    journal = obj.get("journal") or {}

    extra = dict()
    if obj.get("doi"):
        extra["doi"] = obj["doi"].lower()
    if journal.get("name"):
        extra["container_name"] = journal["name"]
    # NOTE(review): indentation was lost in the reviewed copy; this flag is
    # assumed to be set unconditionally -- confirm. It also means ``extra``
    # is never empty at this point.
    extra["is_longtail_oa"] = True
    # TODO: ISSN/eISSN handling? or just journal name lookup?

    return dict(
        title=obj["title"].strip(),
        contribs=contribs,
        publisher=journal.get("publisher"),
        volume=journal.get("volume"),
        issue=journal.get("issue"),
        abstracts=abstracts,
        release_type="journal-article",
        release_date=release_date,
        extra=dict(grobid=extra),
    )
def run():
    """Parse each stdin line as a JSON document and print the parsed release.

    Every line is decoded with ``json.loads`` and handed to
    ``parse_grobid_json``; documents for which that returns a falsy value
    produce no output.
    """
    # Lazy generator: each line is decoded, parsed and printed before the
    # next one is read, just like a plain loop.
    releases = (parse_grobid_json(json.loads(raw_line)) for raw_line in sys.stdin)
    for release in releases:
        if not release:
            continue
        print(release)
# Script entry point: process stdin only when executed directly, not on import.
if __name__ == "__main__":
    run()
|