1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
#!/usr/bin/env python3
import sys
import json
def transform(m):
    """Flatten one release record into a flat, index-friendly dict.

    Args:
        m: A decoded JSON object (dict) describing a release. The keys
            ``state``, ``ident``, ``revision`` and ``title`` are required;
            every other key is optional and may be missing or ``None``.

    Returns:
        ``None`` when ``m['state']`` is not ``'active'``; otherwise a new
        dict of scalar fields, counts and flags derived from ``m``.

    Raises:
        KeyError: if a required key is missing, or if an entry in a
            file's ``urls`` list has no ``url`` key.
    """
    if m['state'] != 'active':
        return None

    # First, the easy ones (direct copy)
    t = dict(
        ident=m['ident'],
        revision=m['revision'],
        title=m['title'],
        release_date=m.get('release_date'),
        release_type=m.get('release_type'),
        release_status=m.get('release_status'),
        language=m.get('language'),
        doi=m.get('doi'),
        pmid=m.get('pmid'),
        pmcid=m.get('pmcid'),
        isbn13=m.get('isbn13'),
        core_id=m.get('core_id'),
        wikidata_qid=m.get('wikidata_qid'),
    )

    # Container metadata wins when a container is present; otherwise fall
    # back to the publisher / container name stored on the release itself.
    container = m.get('container')
    container_is_kept = False
    if container:
        t['publisher'] = container.get('publisher')
        t['container_name'] = container.get('name')
        t['container_issnl'] = container.get('issnl')
        container_extra = container.get('extra')
        if container_extra:
            t['container_is_oa'] = container_extra.get('is_oa')
            container_is_kept = container_extra.get('is_kept', False)
            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
    else:
        t['publisher'] = m.get('publisher')
        t['container_name'] = m.get('container_name')

    # `or []` (rather than a .get default) also covers an explicit JSON null.
    files = m.get('files') or []
    t['file_count'] = len(files)
    in_wa = False
    in_ia = False
    t['file_pdf_url'] = None
    for f in files:
        is_pdf = 'pdf' in (f.get('mimetype') or '')
        for url in f.get('urls') or []:
            if url.get('rel', '') == 'webarchive':
                in_wa = True
            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
                in_ia = True
                # An archive.org copy of a PDF always overrides any
                # previously chosen PDF URL.
                if is_pdf:
                    t['file_pdf_url'] = url['url']
            # Otherwise keep the first PDF URL encountered.
            if not t['file_pdf_url'] and is_pdf:
                t['file_pdf_url'] = url['url']
    t['file_in_webarchive'] = in_wa
    t['file_in_ia'] = in_ia

    extra = m.get('extra') or {}
    if extra:
        t['in_shadow'] = extra.get('in_shadow')
        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
            t['container_is_longtail_oa'] = True
    # Abstracts live on the input record; the output dict never carries an
    # 'abstracts' key, so looking there would always yield False.
    t['any_abstract'] = bool(m.get('abstracts'))
    t['is_kept'] = container_is_kept or extra.get('is_kept', False)

    contribs = m.get('contribs') or []
    t['ref_count'] = len(m.get('refs') or [])
    t['contrib_count'] = len(contribs)
    # Only contributors with a non-empty raw_name are listed, so this list
    # may be shorter than contrib_count.
    t['contrib_names'] = [c['raw_name'] for c in contribs if c.get('raw_name')]
    return t
def run():
    """Read JSON records from stdin, one per line, and print the transformed ones.

    Each input line is decoded with ``json.loads`` and passed to
    ``transform``. Falsy results (such as ``None``) are skipped; every
    other result is written to stdout as one JSON document per line.
    """
    for raw_line in sys.stdin:
        record = json.loads(raw_line)
        flattened = transform(record)
        if not flattened:
            continue
        print(json.dumps(flattened))
# Script entry point: run the stdin -> stdout conversion only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    run()
|