about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-10-30 17:20:48 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-10-30 17:20:48 -0700
commit 7d78b1ab030cba8c783c78d5a13349b8367fb327 (patch)
tree 66d78bd5c3b18e36c373b8090ce104aa71bb9ef9
parent ee9230fd0fe96a07006b496d678681bae47cb943 (diff)
download sandcrawler-7d78b1ab030cba8c783c78d5a13349b8367fb327.tar.gz
sandcrawler-7d78b1ab030cba8c783c78d5a13349b8367fb327.zip
several bugs and lint issues in import_grobid_metadata
-rwxr-xr-x python/import_grobid_metadata.py 19
1 files changed, 10 insertions, 9 deletions
diff --git a/python/import_grobid_metadata.py b/python/import_grobid_metadata.py
index 4d8d6fa..3d2e14c 100755
--- a/python/import_grobid_metadata.py
+++ b/python/import_grobid_metadata.py
@@ -7,11 +7,10 @@ import datetime
MAX_ABSTRACT_BYTES=4096
def parse_grobid_json(obj):
-
+
if not obj.get('title'):
return None
- release = dict()
extra = dict()
if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
@@ -55,15 +54,15 @@ def parse_grobid_json(obj):
release_type = "journal-article"
release_date = None
- if raw.get('date'):
+ if obj.get('date'):
# TODO: only returns year, ever? how to handle?
- release_date = datetime.datetime(year=raw['date'], month=1, day=1)
+ release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+
+ if obj.get('doi'):
+ extra['doi'] = obj['doi']
+ if obj['journal'].get('name'):
+ extra['container_name'] = obj['journal']['name']
- if raw.get('doi'):
- extra['doi'] = raw['doi']
- if raw['journal'].get('name'):
- extra['container_name'] = raw['journal']['name']
-
extra['is_longtail_oa'] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -80,6 +79,8 @@ def parse_grobid_json(obj):
volume=obj['journal'].get('volume'),
issue=obj['journal'].get('issue'),
abstracts=abstracts,
+ release_type=release_type,
+ release_date=release_date,
extra=extra)
def run():