diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-10-30 17:20:48 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-10-30 17:20:48 -0700 |
commit | 7d78b1ab030cba8c783c78d5a13349b8367fb327 (patch) | |
tree | 66d78bd5c3b18e36c373b8090ce104aa71bb9ef9 | |
parent | ee9230fd0fe96a07006b496d678681bae47cb943 (diff) | |
download | sandcrawler-7d78b1ab030cba8c783c78d5a13349b8367fb327.tar.gz sandcrawler-7d78b1ab030cba8c783c78d5a13349b8367fb327.zip |
Fix several bugs and lint issues in import_grobid_metadata
-rwxr-xr-x | python/import_grobid_metadata.py | 19 |
1 file changed, 10 insertions, 9 deletions
```diff
diff --git a/python/import_grobid_metadata.py b/python/import_grobid_metadata.py
index 4d8d6fa..3d2e14c 100755
--- a/python/import_grobid_metadata.py
+++ b/python/import_grobid_metadata.py
@@ -7,11 +7,10 @@ import datetime
 MAX_ABSTRACT_BYTES=4096
 
 def parse_grobid_json(obj):
-    
+
     if not obj.get('title'):
         return None
 
-    release = dict()
     extra = dict()
 
     if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
@@ -55,15 +54,15 @@ def parse_grobid_json(obj):
 
     release_type = "journal-article"
     release_date = None
-    if raw.get('date'):
+    if obj.get('date'):
         # TODO: only returns year, ever? how to handle?
-        release_date = datetime.datetime(year=raw['date'], month=1, day=1)
+        release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+
+    if obj.get('doi'):
+        extra['doi'] = obj['doi']
+    if obj['journal'].get('name'):
+        extra['container_name'] = obj['journal']['name']
 
-    if raw.get('doi'):
-        extra['doi'] = raw['doi']
-    if raw['journal'].get('name'):
-        extra['container_name'] = raw['journal']['name']
-    
     extra['is_longtail_oa'] = True
 
     # TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -80,6 +79,8 @@ def parse_grobid_json(obj):
         volume=obj['journal'].get('volume'),
         issue=obj['journal'].get('issue'),
         abstracts=abstracts,
+        release_type=release_type,
+        release_date=release_date,
         extra=extra)
 
 def run():
```