diff options
Diffstat (limited to 'python/scripts/import_grobid_metadata.py')
-rwxr-xr-x | python/scripts/import_grobid_metadata.py | 69 |
1 files changed, 35 insertions, 34 deletions
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index 3d2e14c..f941881 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -1,69 +1,67 @@ #!/usr/bin/env python3 -import sys -import json import datetime +import json +import sys + +MAX_ABSTRACT_BYTES = 4096 -MAX_ABSTRACT_BYTES=4096 def parse_grobid_json(obj): - if not obj.get('title'): + if not obj.get("title"): return None extra = dict() - if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: - abobj = dict( - mimetype="text/plain", - language=None, - content=obj.get('abstract').strip()) + if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES: + abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip()) abstracts = [abobj] else: abstracts = None contribs = [] - for a in obj.get('authors', []): + for a in obj.get("authors", []): c = dict(raw_name=a, role="author") contribs.append(c) refs = [] - for raw in obj.get('citations', []): + for raw in obj.get("citations", []): extra = dict() ref = dict() - ref['key'] = raw.get('id') - if raw.get('title'): - ref['title'] = raw['title'].strip() - if raw.get('date'): + ref["key"] = raw.get("id") + if raw.get("title"): + ref["title"] = raw["title"].strip() + if raw.get("date"): try: - year = int(raw['date'].strip()[:4]) - ref['year'] = year + year = int(raw["date"].strip()[:4]) + ref["year"] = year except: pass - for key in ('volume', 'url', 'issue', 'publisher'): + for key in ("volume", "url", "issue", "publisher"): if raw.get(key): extra[key] = raw[key].strip() - if raw.get('authors'): - extra['authors'] = [a['name'] for a in raw['authors']] + if raw.get("authors"): + extra["authors"] = [a["name"] for a in raw["authors"]] if extra: extra = dict(grobid=extra) else: extra = None - ref['extra'] = extra + ref["extra"] = extra refs.append(ref) release_type = "journal-article" release_date = None - if obj.get('date'): + if obj.get("date"): # TODO: only returns year, ever? how to handle? - release_date = datetime.datetime(year=obj['date'], month=1, day=1) + release_date = datetime.datetime(year=obj["date"], month=1, day=1) - if obj.get('doi'): - extra['doi'] = obj['doi'] - if obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] + if obj.get("doi"): + extra["doi"] = obj["doi"].lower() + if obj["journal"].get("name"): + extra["container_name"] = obj["journal"]["name"] - extra['is_longtail_oa'] = True + extra["is_longtail_oa"] = True # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -73,15 +71,17 @@ def parse_grobid_json(obj): extra = None return dict( - title=obj['title'].strip(), + title=obj["title"].strip(), contribs=contribs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=obj["journal"].get("publisher"), + volume=obj["journal"].get("volume"), + issue=obj["journal"].get("issue"), abstracts=abstracts, release_type=release_type, release_date=release_date, - extra=extra) + extra=extra, + ) + def run(): for line in sys.stdin: @@ -90,5 +90,6 @@ def run(): if out: print(out) -if __name__=="__main__": + +if __name__ == "__main__": run() |