diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-28 18:42:55 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-28 18:42:55 -0800 |
commit | 722d30e403c1a1cd67bf5d36520b6ed1d6d38a46 (patch) | |
tree | 43a11231ea24c52d6de39bb862d19837150443fc /python/fatcat_tools/importers | |
parent | 050bcae1dc1483479c1e5c7c44105f0c825a88d1 (diff) | |
download | fatcat-722d30e403c1a1cd67bf5d36520b6ed1d6d38a46.tar.gz fatcat-722d30e403c1a1cd67bf5d36520b6ed1d6d38a46.zip |
many fixes in GROBID importer
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 24 |
1 file changed, 10 insertions, 14 deletions
```diff
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 28358959..7bd7c00e 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -81,9 +81,8 @@ class GrobidMetadataImporter(EntityImporter):
 
         abstract = obj.get('abstract')
         if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
-            abobj = dict(
+            abobj = fatcat_client.ReleaseEntityAbstracts(
                 mimetype="text/plain",
-                language=None,
                 content=clean(obj.get('abstract')))
             abstracts = [abobj]
         else:
@@ -97,31 +96,28 @@ class GrobidMetadataImporter(EntityImporter):
                 role="author",
                 extra=None))
 
-        # XXX: why is this a dict()? not covered by tests?
         refs = []
         for raw in obj.get('citations', []):
             cite_extra = dict()
-            ref = dict()
-            ref['key'] = clean(raw.get('id'))
-            if raw.get('title'):
-                ref['title'] = clean(raw['title'])
+            year = None
             if raw.get('date'):
                 try:
                     year = int(raw['date'].strip()[:4])
-                    ref['year'] = year
                 except:
                     pass
             for key in ('volume', 'url', 'issue', 'publisher'):
                 if raw.get(key):
                     cite_extra[key] = clean(raw[key])
             if raw.get('authors'):
-                cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
-            if cite_extra:
-                cite_extra = dict(grobid=cite_extra)
-            else:
+                cite_extra['author'] = [clean(a['name']) for a in raw['authors']]
+
+            if not cite_extra:
                 cite_extra = None
-            ref['extra'] = cite_extra
-            refs.append(ref)
+            refs.append(fatcat_client.ReleaseRef(
+                key=clean(raw.get('id')),
+                year=year,
+                title=clean(raw['title']),
+                extra=cite_extra))
 
         release_date = None
         release_year = None
```