diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-27 13:31:30 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-27 13:31:30 -0700 | 
| commit | 72d14a1ea8113d715e3f7933332829876a438618 (patch) | |
| tree | 28f8f9bdc29d065c9a629a229a9472c792fce6a8 /python | |
| parent | 95024f64d18f165bd262f64153a433120f5b13eb (diff) | |
| download | fatcat-72d14a1ea8113d715e3f7933332829876a438618.tar.gz fatcat-72d14a1ea8113d715e3f7933332829876a438618.zip | |
move grobid metadata importer from sandcrawler
Diffstat (limited to 'python')
| -rwxr-xr-x | python/fatcat/grobid_metadata_importer.py | 93 | 
1 files changed, 93 insertions, 0 deletions
| diff --git a/python/fatcat/grobid_metadata_importer.py b/python/fatcat/grobid_metadata_importer.py new file mode 100755 index 00000000..4d8d6fa3 --- /dev/null +++ b/python/fatcat/grobid_metadata_importer.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +import sys +import json +import datetime + +MAX_ABSTRACT_BYTES=4096 + +def parse_grobid_json(obj): +     +    if not obj.get('title'): +        return None + +    release = dict() +    extra = dict() + +    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: +        abobj = dict( +            mimetype="text/plain", +            language=None, +            content=obj.get('abstract').strip()) +        abstracts = [abobj] +    else: +        abstracts = None + +    contribs = [] +    for a in obj.get('authors', []): +        c = dict(raw_name=a, role="author") +        contribs.append(c) + +    refs = [] +    for raw in obj.get('citations', []): +        extra = dict() +        ref = dict() +        ref['key'] = raw.get('id') +        if raw.get('title'): +            ref['title'] = raw['title'].strip() +        if raw.get('date'): +            try: +                year = int(raw['date'].strip()[:4]) +                ref['year'] = year +            except: +                pass +        for key in ('volume', 'url', 'issue', 'publisher'): +            if raw.get(key): +                extra[key] = raw[key].strip() +        if raw.get('authors'): +            extra['authors'] = [a['name'] for a in raw['authors']] +        if extra: +            extra = dict(grobid=extra) +        else: +            extra = None +        ref['extra'] = extra +        refs.append(ref) + +    release_type = "journal-article" +    release_date = None +    if raw.get('date'): +        # TODO: only returns year, ever? how to handle? +        release_date = datetime.datetime(year=raw['date'], month=1, day=1) + +    if raw.get('doi'): +        extra['doi'] = raw['doi'] +    if raw['journal'].get('name'): +        extra['container_name'] = raw['journal']['name'] +     +    extra['is_longtail_oa'] = True + +    # TODO: ISSN/eISSN handling? or just journal name lookup? + +    if extra: +        extra = dict(grobid=extra) +    else: +        extra = None + +    return dict( +        title=obj['title'].strip(), +        contribs=contribs, +        publisher=obj['journal'].get('publisher'), +        volume=obj['journal'].get('volume'), +        issue=obj['journal'].get('issue'), +        abstracts=abstracts, +        extra=extra) + +def run(): +    for line in sys.stdin: +        obj = json.loads(line) +        out = parse_grobid_json(obj) +        if out: +            print(out) + +if __name__=="__main__": +    run() | 
