1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#!/usr/bin/env python3
import datetime
import json
import sys
# Size limit for abstracts: parse_grobid_json only keeps an abstract whose
# len() is strictly below this value; longer abstracts are dropped.
# NOTE(review): the comparison uses len() on the decoded abstract, which for a
# str counts characters rather than encoded bytes -- confirm which is intended.
MAX_ABSTRACT_BYTES = 4096
def parse_grobid_json(obj):
    """Convert one GROBID metadata record into a release-style dict.

    Args:
        obj: dict decoded from one GROBID JSON record. Keys read here are
            'title', 'abstract', 'authors', 'citations', 'date', 'doi' and
            'journal' (a dict with optional 'name', 'publisher', 'volume'
            and 'issue'). Every key except 'title' may be missing or null.

    Returns:
        None when the record has no (truthy) title. Otherwise a dict with
        the keys 'title', 'contribs', 'refs', 'publisher', 'volume',
        'issue', 'abstracts', 'release_type', 'release_date' and 'extra'.
        'release_date' is a datetime.datetime pinned to January 1st of the
        parsed year, or None when no usable year is present.
    """
    if not obj.get('title'):
        return None

    # Keep the abstract only when it is present and below the size limit.
    abstract = obj.get('abstract')
    if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
        abstracts = [
            dict(mimetype="text/plain", language=None, content=abstract.strip()),
        ]
    else:
        abstracts = None

    contribs = [
        dict(raw_name=name, role="author") for name in obj.get('authors') or []
    ]

    # Each citation builds its own 'extra' dict inside the helper, so the
    # release-level 'extra' below can no longer be clobbered by the loop.
    refs = [_parse_grobid_citation(raw) for raw in obj.get('citations') or []]

    release_type = "journal-article"
    release_date = None
    # TODO: only returns year, ever? how to handle?
    year = _grobid_year(obj.get('date'))
    if year is not None:
        try:
            release_date = datetime.datetime(year=year, month=1, day=1)
        except ValueError:
            # Year outside the range datetime supports: treat as unknown.
            release_date = None

    # A record without journal information is still a valid release.
    journal = obj.get('journal') or {}

    extra = dict()
    if obj.get('doi'):
        extra['doi'] = obj['doi'].lower()
    if journal.get('name'):
        extra['container_name'] = journal['name']
    extra['is_longtail_oa'] = True
    # TODO: ISSN/eISSN handling? or just journal name lookup?

    # 'is_longtail_oa' is always set, so the GROBID extras are never empty.
    extra = dict(grobid=extra)

    return dict(
        title=obj['title'].strip(),
        contribs=contribs,
        refs=refs,
        publisher=journal.get('publisher'),
        volume=journal.get('volume'),
        issue=journal.get('issue'),
        abstracts=abstracts,
        release_type=release_type,
        release_date=release_date,
        extra=extra,
    )


def _parse_grobid_citation(raw):
    """Build one reference dict ('key', optional 'title'/'year', 'extra')."""
    ref = dict(key=raw.get('id'))
    if raw.get('title'):
        ref['title'] = raw['title'].strip()

    year = _grobid_year(raw.get('date'))
    if year is not None:
        ref['year'] = year

    grobid = {
        key: raw[key].strip()
        for key in ('volume', 'url', 'issue', 'publisher')
        if raw.get(key)
    }
    if raw.get('authors'):
        grobid['authors'] = [a['name'] for a in raw['authors']]

    ref['extra'] = dict(grobid=grobid) if grobid else None
    return ref


def _grobid_year(value):
    """Return the year of a GROBID date value as an int, or None if unusable."""
    if not value:
        return None
    if isinstance(value, int):
        return value
    try:
        # Dates look like 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'; only the leading
        # four characters are used.
        return int(str(value).strip()[:4])
    except ValueError:
        return None
def run():
    """Stream GROBID JSON records from stdin through parse_grobid_json.

    Each non-blank line of stdin is decoded as one JSON document and handed
    to parse_grobid_json. Truthy results are written to stdout with print(),
    one per line, in Python dict repr form; records for which
    parse_grobid_json returns None are skipped.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    for line in sys.stdin:
        # Blank lines (e.g. a trailing newline at the end of the input) hold
        # no record, and json.loads would raise on them and abort the stream.
        if not line.strip():
            continue
        out = parse_grobid_json(json.loads(line))
        if out:
            print(out)
# Script entry point: process stdin only when this file is executed directly,
# so that importing the module has no side effects.
if __name__ == "__main__":
    run()
|