diff options
Diffstat (limited to 'python/scripts/import_grobid_metadata.py')
-rwxr-xr-x | python/scripts/import_grobid_metadata.py | 94 |
1 files changed, 94 insertions, 0 deletions
#!/usr/bin/env python3
"""Convert GROBID metadata JSON (one object per stdin line) into release dicts.

Each input line is parsed as JSON, passed through ``parse_grobid_json`` and,
when a record is produced, printed to stdout.
"""

import sys
import json
import datetime

# Abstracts at or above this length are dropped.
# NOTE(review): the comparison below uses len() on the abstract, which counts
# characters, not encoded bytes, when the abstract is a str.
MAX_ABSTRACT_BYTES = 4096


def _parse_year(raw_date):
    """Return the year found at the start of *raw_date* as an int, or None.

    Accepts an int (returned unchanged) or any value whose string form starts
    with a four-digit year. Falsy or unparseable input yields None.
    """
    if not raw_date:
        return None
    if isinstance(raw_date, int) and not isinstance(raw_date, bool):
        return raw_date
    try:
        return int(str(raw_date).strip()[:4])
    except ValueError:
        return None


def _parse_citation(raw):
    """Build a reference dict from one entry of the ``citations`` list."""
    ref = dict()
    ref['key'] = raw.get('id')
    if raw.get('title'):
        ref['title'] = raw['title'].strip()

    year = _parse_year(raw.get('date'))
    if year is not None:
        ref['year'] = year

    # Kept in its own dict so the release-level ``extra`` in the caller is
    # never touched while citations are processed.
    cite_extra = dict()
    for key in ('volume', 'url', 'issue', 'publisher'):
        if raw.get(key):
            cite_extra[key] = raw[key].strip()
    if raw.get('authors'):
        cite_extra['authors'] = [a['name'] for a in raw['authors']]

    ref['extra'] = dict(grobid=cite_extra) if cite_extra else None
    return ref


def parse_grobid_json(obj):
    """Translate one parsed GROBID JSON object into a release dict.

    Args:
        obj: dict decoded from one line of input. Keys read here: ``title``,
            ``abstract``, ``authors``, ``citations``, ``date``, ``doi`` and
            ``journal`` (a dict with ``name``, ``publisher``, ``volume``,
            ``issue``). All keys except ``title`` are optional.

    Returns:
        None when ``title`` is missing or empty. Otherwise a dict with keys
        ``title``, ``contribs``, ``refs``, ``publisher``, ``volume``,
        ``issue``, ``abstracts``, ``release_type``, ``release_date`` and
        ``extra``.
    """
    if not obj.get('title'):
        return None

    abstract = obj.get('abstract')
    if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
        abstracts = [dict(
            mimetype="text/plain",
            language=None,
            content=abstract.strip())]
    else:
        abstracts = None

    # NOTE(review): each author entry is stored verbatim as raw_name; confirm
    # whether top-level authors are plain strings or dicts with a 'name' key.
    contribs = [dict(raw_name=a, role="author")
                for a in obj.get('authors') or []]

    refs = [_parse_citation(raw) for raw in obj.get('citations') or []]

    release_type = "journal-article"

    # TODO: only returns year, ever? how to handle?
    release_date = None
    year = _parse_year(obj.get('date'))
    if year is not None and datetime.MINYEAR <= year <= datetime.MAXYEAR:
        release_date = datetime.datetime(year=year, month=1, day=1)

    # A record without a journal is treated as having an empty one.
    journal = obj.get('journal') or {}

    extra = dict()
    if obj.get('doi'):
        extra['doi'] = obj['doi']
    if journal.get('name'):
        extra['container_name'] = journal['name']
    extra['is_longtail_oa'] = True

    # TODO: ISSN/eISSN handling? or just journal name lookup?

    # ``extra`` always holds at least is_longtail_oa, so it is never empty.
    extra = dict(grobid=extra)

    return dict(
        title=obj['title'].strip(),
        contribs=contribs,
        refs=refs,
        publisher=journal.get('publisher'),
        volume=journal.get('volume'),
        issue=journal.get('issue'),
        abstracts=abstracts,
        release_type=release_type,
        release_date=release_date,
        extra=extra)


def run():
    """Read JSON lines from stdin and print each converted record."""
    for line in sys.stdin:
        if not line.strip():
            # json.loads raises on an empty string; skip blank lines.
            continue
        obj = json.loads(line)
        out = parse_grobid_json(obj)
        if out:
            # NOTE(review): this prints the Python repr of the dict (which may
            # include a datetime object), not JSON.
            print(out)


if __name__ == "__main__":
    run()