diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-27 13:31:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-27 13:31:30 -0700 |
commit | 72d14a1ea8113d715e3f7933332829876a438618 (patch) | |
tree | 28f8f9bdc29d065c9a629a229a9472c792fce6a8 | |
parent | 95024f64d18f165bd262f64153a433120f5b13eb (diff) | |
download | fatcat-72d14a1ea8113d715e3f7933332829876a438618.tar.gz fatcat-72d14a1ea8113d715e3f7933332829876a438618.zip |
move grobid metadata importer from sandcrawler
-rwxr-xr-x | python/fatcat/grobid_metadata_importer.py | 93 |
1 files changed, 93 insertions, 0 deletions
#!/usr/bin/env python3
"""GROBID metadata importer (python/fatcat/grobid_metadata_importer.py).

Reads JSON objects, one per line, from stdin and converts each into a
release-style dict via :func:`parse_grobid_json`.
"""

import sys
import json
import datetime

# Upper bound for accepted abstracts.
# NOTE: the comparison below uses len() of the string, i.e. characters,
# not encoded bytes.
MAX_ABSTRACT_BYTES = 4096


def _parse_year(value):
    """Return the leading four characters of *value* as an int year.

    Returns None when *value* is empty/missing or does not start with an
    integer, so callers can simply skip the field.
    """
    if not value:
        return None
    try:
        return int(str(value).strip()[:4])
    except ValueError:
        return None


def _author_name(author):
    """Return the display name for an author given as a dict or a string."""
    if isinstance(author, dict):
        return author.get('name')
    return author


def _parse_citation(raw):
    """Convert one entry of the ``citations`` list into a reference dict.

    The result always has ``key`` and ``extra``; ``title`` and ``year`` are
    only present when they could be extracted. ``extra`` is either None or
    ``{'grobid': {...}}`` holding volume/url/issue/publisher/authors.
    """
    ref = dict()
    ref['key'] = raw.get('id')
    if raw.get('title'):
        ref['title'] = raw['title'].strip()

    year = _parse_year(raw.get('date'))
    if year is not None:
        ref['year'] = year

    # Kept in a local dict so the document-level extra is never clobbered.
    ref_extra = dict()
    for key in ('volume', 'url', 'issue', 'publisher'):
        if raw.get(key):
            ref_extra[key] = raw[key].strip()
    if raw.get('authors'):
        ref_extra['authors'] = [_author_name(a) for a in raw['authors']]

    ref['extra'] = dict(grobid=ref_extra) if ref_extra else None
    return ref


def parse_grobid_json(obj):
    """Build a release-style dict from one parsed GROBID JSON object.

    Keys read from *obj*: ``title``, ``abstract``, ``authors``,
    ``citations``, ``date``, ``doi`` and ``journal`` (a dict with ``name``,
    ``publisher``, ``volume``, ``issue``). All but ``title`` are optional.

    Returns None when *obj* has no (non-empty) title. Otherwise returns a
    dict with ``title``, ``contribs``, ``publisher``, ``volume``, ``issue``,
    ``abstracts`` and ``extra``, plus ``refs``, ``release_type`` and
    ``release_date`` (a ``datetime.datetime`` for January 1st of the
    document year, or None).
    """
    if not obj.get('title'):
        return None

    abstract = obj.get('abstract')
    if abstract and len(abstract) < MAX_ABSTRACT_BYTES:
        abstracts = [dict(
            mimetype="text/plain",
            language=None,
            content=abstract.strip())]
    else:
        # Missing or over-long abstracts are dropped entirely.
        abstracts = None

    contribs = [
        dict(raw_name=_author_name(a), role="author")
        for a in (obj.get('authors') or [])
    ]

    refs = [_parse_citation(raw) for raw in (obj.get('citations') or [])]

    release_type = "journal-article"
    release_date = None
    # TODO: only returns year, ever? how to handle?
    year = _parse_year(obj.get('date'))
    if year is not None:
        try:
            release_date = datetime.datetime(year=year, month=1, day=1)
        except ValueError:
            # Year outside the range datetime supports (e.g. 0).
            release_date = None

    # A record without journal information must not raise.
    journal = obj.get('journal') or {}

    extra = dict()
    if obj.get('doi'):
        extra['doi'] = obj['doi']
    if journal.get('name'):
        extra['container_name'] = journal['name']
    extra['is_longtail_oa'] = True

    # TODO: ISSN/eISSN handling? or just journal name lookup?

    # extra always contains is_longtail_oa, so it is never empty here.
    extra = dict(grobid=extra)

    return dict(
        title=obj['title'].strip(),
        contribs=contribs,
        publisher=journal.get('publisher'),
        volume=journal.get('volume'),
        issue=journal.get('issue'),
        abstracts=abstracts,
        refs=refs,
        release_type=release_type,
        release_date=release_date,
        extra=extra)


def run():
    """Parse JSON lines from stdin and print each converted record.

    Blank lines are skipped; records for which parse_grobid_json returns
    None (no title) produce no output. Records are written with print(),
    i.e. as the dict's Python repr.
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        out = parse_grobid_json(obj)
        if out:
            print(out)


if __name__ == "__main__":
    run()