diff options
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 123 |
1 files changed, 77 insertions, 46 deletions
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 5e61a154..9d95fe0b 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,22 @@ import json import base64 import datetime import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean MAX_ABSTRACT_BYTES=4096 -class GrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(EntityImporter): + """ + This is a complex case: we need to parse and create both file and release entities. + + The "primary" entity here is really File, not Release. If a matching File + exists, we bail in want(); if not we insert the Release during parsing, and + insert both. + + TODO: should instead check if the File has any releases; if not, insert and update. + TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. + """ def __init__(self, api, **kwargs): @@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") + self.longtail_oa = kwargs.get("longtail_oa", False) + + def want(self, raw_record): + return True + + def parse_record(self, row): + + fields = row.split('\t') + sha1_key = fields[0] + cdx = json.loads(fields[1]) + mimetype = fields[2] + file_size = int(fields[3]) + grobid_meta = json.loads(fields[4]) + fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) + re = self.parse_grobid_json(grobid_meta) + + if not (fe and re): + return None + + # lookup existing file SHA1 + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # if file is already in here, presumably not actually long-tail + # HACK: this is doing an exists check in parse_record(), which is weird + # TODO: this is where we should check if the file actually has + # release_ids and/or URLs associated with it + if existing and not self.bezerk_mode: + self.counts['exists'] += 1 + self.counts['skip'] -= 1 + return None + + release_edit = self.create_release(re) + fe.release_ids.append(release_edit.ident) + return fe def parse_grobid_json(self, obj): @@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter): abobj = dict( mimetype="text/plain", language=None, - content=obj.get('abstract').strip()) + content=clean(obj.get('abstract'))) abstracts = [abobj] else: abstracts = None @@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter): for i, a in enumerate(obj.get('authors', [])): contribs.append(fatcat_client.ReleaseContrib( index=i, - raw_name=a['name'], + raw_name=clean(a['name']), role="author", extra=None)) + # XXX: why is this a dict()? not covered by tests? refs = [] for raw in obj.get('citations', []): cite_extra = dict() ref = dict() - ref['key'] = raw.get('id') + ref['key'] = clean(raw.get('id')) if raw.get('title'): - ref['title'] = raw['title'].strip() + ref['title'] = clean(raw['title']) if raw.get('date'): try: year = int(raw['date'].strip()[:4]) @@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter): pass for key in ('volume', 'url', 'issue', 'publisher'): if raw.get(key): - cite_extra[key] = raw[key].strip() + cite_extra[key] = clean(raw[key]) if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] + cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] if cite_extra: cite_extra = dict(grobid=cite_extra) else: @@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter): if obj.get('doi'): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True + extra['container_name'] = clean(obj['journal']['name']) # TODO: ISSN/eISSN handling? or just journal name lookup? + if self.longtail_oa: + extra['longtail_oa'] = True + if extra: extra = dict(grobid=extra) else: extra = None re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), + title=clean(obj['title'], force_xml=True), release_type="article-journal", release_date=release_date, release_year=release_year, contribs=contribs, refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=clean(obj['journal'].get('publisher')), + volume=clean(obj['journal'].get('volume')), + issue=clean(obj['journal'].get('issue')), abstracts=abstracts, extra=extra) return re @@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter): sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() - # lookup existing SHA1, or create new entity - try: - existing_file = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - existing_file = None - - if existing_file: - # if file is already in here, presumably not actually long-tail - return None fe = fatcat_client.FileEntity( sha1=sha1, size=int(file_size), @@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter): # parse URLs and CDX original = cdx['url'] + assert len(cdx['dt']) >= 8 wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) @@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter): return fe - def create_row(self, row, editgroup_id=None): - if not row: - return - fields = row.split('\t') - sha1_key = fields[0] - cdx = json.loads(fields[1]) - mimetype = fields[2] - file_size = int(fields[3]) - grobid_meta = json.loads(fields[4]) - fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) - re = self.parse_grobid_json(grobid_meta) - if fe and re: - release_entity = self.api.create_release(re, editgroup_id=editgroup_id) - # release ident can't already be in release list because we just - # created it - fe.release_ids.append(release_entity.ident) - file_entity = self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - # NB: batch mode not implemented + def try_update(self, entity): + # did the exists check in 'parse_record()', because we needed to create a release + return True + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + |