aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/import_grobid_metadata.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/import_grobid_metadata.py')
-rwxr-xr-xpython/scripts/import_grobid_metadata.py94
1 files changed, 94 insertions, 0 deletions
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
new file mode 100755
index 0000000..3d2e14c
--- /dev/null
+++ b/python/scripts/import_grobid_metadata.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import datetime
+
# Upper bound on the abstract length that will be kept.
# NOTE(review): despite the name, the check below compares against
# len(str), i.e. characters rather than encoded bytes.
MAX_ABSTRACT_BYTES = 4096


def _parse_year(value):
    """Return the leading four characters of *value* as an int year.

    Accepts strings such as "2001" or "2001-05-03" as well as plain ints.
    Returns None when *value* is falsy or does not start with a number.
    """
    if not value:
        return None
    try:
        return int(str(value).strip()[:4])
    except ValueError:
        return None


def _parse_citation(raw):
    """Convert one entry of the 'citations' list into a reference dict."""
    ref = dict(key=raw.get('id'))
    if raw.get('title'):
        ref['title'] = raw['title'].strip()

    year = _parse_year(raw.get('date'))
    if year is not None:
        ref['year'] = year

    # Fields without a dedicated slot are kept under extra['grobid'].
    ref_extra = dict()
    for key in ('volume', 'url', 'issue', 'publisher'):
        if raw.get(key):
            ref_extra[key] = raw[key].strip()
    if raw.get('authors'):
        ref_extra['authors'] = [a['name'] for a in raw['authors']]

    ref['extra'] = dict(grobid=ref_extra) if ref_extra else None
    return ref


def parse_grobid_json(obj):
    """Build a release-style dict from one parsed GROBID JSON record.

    Args:
        obj: dict decoded from a single JSON line. Keys read here are
            'title', 'abstract', 'authors', 'citations', 'date', 'doi'
            and 'journal' (a dict with 'name', 'publisher', 'volume',
            'issue'). All but 'title' are optional.

    Returns:
        None when the record has no (non-empty) title; otherwise a dict
        with the keys 'title', 'contribs', 'publisher', 'volume', 'issue',
        'abstracts', 'release_type', 'release_date', 'extra' and 'refs'.
        'refs' holds the parsed citations, which were previously computed
        but never returned.
    """
    if not obj.get('title'):
        return None

    abstract = obj.get('abstract')
    if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
        abstracts = [dict(
            mimetype="text/plain",
            language=None,
            content=abstract.strip())]
    else:
        # Missing or over-long abstracts are dropped entirely.
        abstracts = None

    contribs = [
        dict(raw_name=name, role="author")
        for name in obj.get('authors') or []
    ]

    # Each citation gets its own 'extra' dict inside the helper, so the
    # record-level 'extra' built below can no longer be clobbered.
    refs = [_parse_citation(raw) for raw in obj.get('citations') or []]

    release_date = None
    year = _parse_year(obj.get('date'))
    if year is not None:
        # TODO: only returns year, ever? how to handle?
        try:
            release_date = datetime.datetime(year=year, month=1, day=1)
        except ValueError:
            # Year outside the range datetime supports (e.g. "0000").
            release_date = None

    # A record without journal metadata must not raise KeyError.
    journal = obj.get('journal') or dict()

    extra = dict()
    if obj.get('doi'):
        extra['doi'] = obj['doi']
    if journal.get('name'):
        extra['container_name'] = journal['name']
    extra['is_longtail_oa'] = True

    # TODO: ISSN/eISSN handling? or just journal name lookup?

    return dict(
        title=obj['title'].strip(),
        contribs=contribs,
        publisher=journal.get('publisher'),
        volume=journal.get('volume'),
        issue=journal.get('issue'),
        abstracts=abstracts,
        release_type="journal-article",
        release_date=release_date,
        # 'extra' is never empty because is_longtail_oa is always set.
        extra=dict(grobid=extra),
        refs=refs)
+
def run():
    """Parse GROBID JSON records from stdin, one per line, and print them.

    Blank lines are skipped instead of being handed to the JSON decoder.
    Records for which parse_grobid_json() returns a falsy value produce
    no output.
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        out = parse_grobid_json(json.loads(line))
        if out:
            print(out)
+
# Only process stdin when executed as a script, not when imported.
if __name__ == "__main__":
    run()