author     Bryan Newbold <bnewbold@robocracy.org>    2018-09-27 17:06:15 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>    2018-09-27 17:06:17 -0700
commit     c7573bf142be405f8cb9003400c6860aeb700457 (patch)
tree       552ba89d2dc80e6557519e4bf4ddbcceef058c3b /python/fatcat
parent     72d14a1ea8113d715e3f7933332829876a438618 (diff)
download   fatcat-c7573bf142be405f8cb9003400c6860aeb700457.tar.gz
           fatcat-c7573bf142be405f8cb9003400c6860aeb700457.zip
improvements to grobid_metadata importer
But still fails tests due to database collision/side-effect on sha1
lookup.
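
The failure described above comes from the lookup-before-create pattern in parse_file_metadata() in the diff below: when a file with the same SHA-1 already exists in the database (for example, left over from an earlier test run), the importer silently skips the row, so insert-count assertions fail on re-runs. A self-contained sketch of that side-effect, using stand-in classes rather than the real fatcat_client API (FakeAPI and import_file are illustrative, not from this commit):

```python
# Stand-in for the fatcat API; illustrative only.
class FakeAPI:
    def __init__(self):
        self.files_by_sha1 = {}

    def lookup_file(self, sha1):
        if sha1 not in self.files_by_sha1:
            raise KeyError(sha1)  # real client raises ApiException with status 404
        return self.files_by_sha1[sha1]

    def create_file(self, sha1):
        self.files_by_sha1[sha1] = dict(sha1=sha1)

def import_file(api, sha1, insert_count):
    try:
        api.lookup_file(sha1)
        return insert_count  # SHA-1 already present: skip, no insert
    except KeyError:
        api.create_file(sha1)
        return insert_count + 1

api = FakeAPI()  # imagine this state persisting between test runs
count = import_file(api, "2aae6c35c94f", 0)      # first run: inserts, count == 1
count = import_file(api, "2aae6c35c94f", count)  # re-run: lookup hits, count unchanged
assert count == 1
```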
Diffstat (limited to 'python/fatcat')
-rwxr-xr-x  python/fatcat/grobid_metadata_importer.py  233
1 file changed, 154 insertions(+), 79 deletions(-)
diff --git a/python/fatcat/grobid_metadata_importer.py b/python/fatcat/grobid_metadata_importer.py
index 4d8d6fa3..95cc285e 100755
--- a/python/fatcat/grobid_metadata_importer.py
+++ b/python/fatcat/grobid_metadata_importer.py
@@ -2,92 +2,167 @@
 
 import sys
 import json
+import base64
 import datetime
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
 
 MAX_ABSTRACT_BYTES=4096
 
-def parse_grobid_json(obj):
-
-    if not obj.get('title'):
-        return None
-
-    release = dict()
-    extra = dict()
-
-    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
-        abobj = dict(
-            mimetype="text/plain",
-            language=None,
-            content=obj.get('abstract').strip())
-        abstracts = [abobj]
-    else:
-        abstracts = None
-
-    contribs = []
-    for a in obj.get('authors', []):
-        c = dict(raw_name=a, role="author")
-        contribs.append(c)
-
-    refs = []
-    for raw in obj.get('citations', []):
+
+class FatcatGrobidMetadataImporter(FatcatImporter):
+
+    def __init__(self, host_url, default_link_rel="web"):
+        super().__init__(host_url)
+        self.default_link_rel = default_link_rel
+
+    def parse_grobid_json(self, obj):
+
+        if not obj.get('title'):
+            return None
+
+        release = dict()
         extra = dict()
-        ref = dict()
-        ref['key'] = raw.get('id')
-        if raw.get('title'):
-            ref['title'] = raw['title'].strip()
-        if raw.get('date'):
-            try:
-                year = int(raw['date'].strip()[:4])
-                ref['year'] = year
-            except:
-                pass
-        for key in ('volume', 'url', 'issue', 'publisher'):
-            if raw.get(key):
-                extra[key] = raw[key].strip()
-        if raw.get('authors'):
-            extra['authors'] = [a['name'] for a in raw['authors']]
+
+        if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
+            abobj = dict(
+                mimetype="text/plain",
+                language=None,
+                content=obj.get('abstract').strip())
+            abstracts = [abobj]
+        else:
+            abstracts = None
+
+        contribs = []
+        for i, a in enumerate(obj.get('authors', [])):
+            c = dict(raw_name=a['name'], role="author")
+            contribs.append(fatcat_client.ReleaseContrib(
+                index=i,
+                raw_name=a['name'],
+                role="author",
+                extra=None))
+
+        refs = []
+        for raw in obj.get('citations', []):
+            cite_extra = dict()
+            ref = dict()
+            ref['key'] = raw.get('id')
+            if raw.get('title'):
+                ref['title'] = raw['title'].strip()
+            if raw.get('date'):
+                try:
+                    year = int(raw['date'].strip()[:4])
+                    ref['year'] = year
+                except:
+                    pass
+            for key in ('volume', 'url', 'issue', 'publisher'):
+                if raw.get(key):
+                    cite_extra[key] = raw[key].strip()
+            if raw.get('authors'):
+                cite_extra['authors'] = [a['name'] for a in raw['authors']]
+            if cite_extra:
+                cite_extra = dict(grobid=cite_extra)
+            else:
+                cite_extra = None
+            ref['extra'] = cite_extra
+            refs.append(ref)
+
+        release_type = "journal-article"
+        release_date = None
+        if obj.get('date'):
+            # TODO: only returns year, ever? how to handle?
+            release_date = datetime.datetime(year=int(obj['date'][:4]), month=1, day=1)
+
+        if obj.get('doi'):
+            extra['doi'] = obj['doi']
+        if obj['journal'] and obj['journal'].get('name'):
+            extra['container_name'] = obj['journal']['name']
+
+        extra['is_longtail_oa'] = True
+
+        # TODO: ISSN/eISSN handling? or just journal name lookup?
+
         if extra:
             extra = dict(grobid=extra)
         else:
             extra = None
-        ref['extra'] = extra
-        refs.append(ref)
-
-    release_type = "journal-article"
-    release_date = None
-    if raw.get('date'):
-        # TODO: only returns year, ever? how to handle?
-        release_date = datetime.datetime(year=raw['date'], month=1, day=1)
-
-    if raw.get('doi'):
-        extra['doi'] = raw['doi']
-    if raw['journal'].get('name'):
-        extra['container_name'] = raw['journal']['name']
+
+        re = fatcat_client.ReleaseEntity(
+            title=obj['title'].strip(),
+            contribs=contribs,
+            refs=refs,
+            publisher=obj['journal'].get('publisher'),
+            volume=obj['journal'].get('volume'),
+            issue=obj['journal'].get('issue'),
+            abstracts=abstracts,
+            extra=extra)
+        return re
 
-    extra['is_longtail_oa'] = True
-
-    # TODO: ISSN/eISSN handling? or just journal name lookup?
-
-    if extra:
-        extra = dict(grobid=extra)
-    else:
-        extra = None
-
-    return dict(
-        title=obj['title'].strip(),
-        contribs=contribs,
-        publisher=obj['journal'].get('publisher'),
-        volume=obj['journal'].get('volume'),
-        issue=obj['journal'].get('issue'),
-        abstracts=abstracts,
-        extra=extra)
-
-def run():
-    for line in sys.stdin:
-        obj = json.loads(line)
-        out = parse_grobid_json(obj)
-        if out:
-            print(out)
-
-if __name__=="__main__":
-    run()
+    # TODO: make this a common function somewhere
+    def make_url(self, raw):
+        rel = self.default_link_rel
+        # TODO: this is where we could map specific domains to rel types,
+        # and also filter out bad domains, invalid URLs, etc
+        if "//archive.org/" in raw or "//arxiv.org/" in raw:
+            # TODO: special-case the arxiv.org bulk mirror?
+            rel = "repository"
+        elif "//web.archive.org/" in raw or "//archive.is/" in raw:
+            rel = "webarchive"
+        return fatcat_client.FileEntityUrls(url=raw, rel=rel)
+
+    def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
+
+        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
+
+        # lookup existing SHA1, or create new entity
+        try:
+            existing_file = self.api.lookup_file(sha1=sha1)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+            existing_file = None
+
+        if existing_file:
+            # if file is already in here, presumably not actually long-tail
+            return None
+        fe = fatcat_client.FileEntity(
+            sha1=sha1,
+            size=int(file_size),
+            mimetype=mimetype,
+            releases=[],
+            urls=[],
+        )
+
+        # parse URLs and CDX
+        original = cdx['url']
+        wayback = "https://web.archive.org/web/{}/{}".format(
+            cdx['dt'],
+            original)
+        fe.urls.append(
+            fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
+        original_url = self.make_url(original)
+        if original_url != None:
+            fe.urls.append(original_url)
+
+        return fe
+
+    def create_row(self, row, editgroup=None):
+        if not row:
+            return
+        fields = row.split('\t')
+        sha1_key = fields[0]
+        cdx = json.loads(fields[1])
+        mimetype = fields[2]
+        file_size = int(fields[3])
+        grobid_meta = json.loads(fields[4])
+        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
+        re = self.parse_grobid_json(grobid_meta)
+        if fe and re:
+            release_entity = self.api.create_release(re, editgroup=editgroup)
+            # release ident can't already be in release list because we just
+            # created it
+            fe.releases.append(release_entity.ident)
+            file_entity = self.api.create_file(fe, editgroup=editgroup)
+            self.insert_count = self.insert_count + 1
+
+    # NB: batch mode not implemented
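
A note on the densest line in parse_file_metadata() above: the incoming sha1_key values carry SHA-1 digests in the base32 form used by CDX/WARC tooling (a "sha1:" prefix plus base32-encoded digest bytes), while the fatcat file lookup uses lower-case hex, hence the b32decode/b16encode chain. A standalone sketch of the same conversion, round-tripped against hashlib (the sample input is computed here, not taken from the commit):

```python
import base64
import hashlib

def b32_sha1_to_hex(sha1_key):
    # Mirror of the conversion in parse_file_metadata(): strip the
    # "sha1:" prefix, base32-decode to raw bytes, re-encode as hex.
    b32 = sha1_key.replace('sha1:', '')
    return base64.b16encode(base64.b32decode(b32)).decode('ascii').lower()

# Round-trip check: build a base32 key from known digest bytes, convert
# back, and compare with hashlib's hex digest of the same input.
digest = hashlib.sha1(b"hello world").digest()
key = "sha1:" + base64.b32encode(digest).decode('ascii')
assert b32_sha1_to_hex(key) == hashlib.sha1(b"hello world").hexdigest()
print(b32_sha1_to_hex(key))  # 2aae6c35c94fcfb415dbe95f408b9ce91ee846ed
```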
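For reference, create_row() consumes one tab-separated line per file: the base32 SHA-1 key, CDX capture metadata as JSON, mimetype, size in bytes, and the GROBID-extracted metadata as JSON. A hypothetical row, with every value invented for illustration:

```python
import json

# Hypothetical input row for create_row(); all values below are invented.
fields = [
    "sha1:FKXGYNOJJ6PLIFO35FPUBC44X2I65BHN",           # base32 SHA-1 key
    json.dumps({"url": "http://example.com/paper.pdf",
                "dt": "20170811115414"}),              # CDX capture info
    "application/pdf",                                 # mimetype
    "12345",                                           # file size in bytes
    json.dumps({"title": "An Example Paper",
                "authors": [{"name": "Jane Doe"}],
                "journal": {"name": "Example Journal"},
                "citations": []}),                     # GROBID metadata
]
row = "\t".join(fields)

# create_row() splits the same way before dispatching to the parsers:
sha1_key, cdx, mimetype, file_size, grobid_meta = row.split('\t')
assert json.loads(grobid_meta)["title"] == "An Example Paper"
```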