Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py | 136
1 file changed, 74 insertions(+), 62 deletions(-)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 0f666652..f7bb5357 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,7 +7,7 @@ import fatcat_openapi_client
 
 from .common import EntityImporter, clean, make_rel_url
 
-MAX_ABSTRACT_BYTES=4096
+MAX_ABSTRACT_BYTES = 4096
 
 
 class GrobidMetadataImporter(EntityImporter):
@@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter):
 
     def __init__(self, api, **kwargs):
 
-        eg_desc = kwargs.get('editgroup_description',
-            "Import of release and file metadata, as extracted from PDFs by GROBID.")
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
-        super().__init__(api,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
+        eg_desc = kwargs.get(
+            "editgroup_description",
+            "Import of release and file metadata, as extracted from PDFs by GROBID.",
+        )
+        eg_extra = kwargs.get("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter")
+        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         self.longtail_oa = kwargs.get("longtail_oa", False)
@@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter):
 
     def parse_record(self, row):
 
-        fields = row.split('\t')
+        fields = row.split("\t")
         sha1_key = fields[0]
         cdx = json.loads(fields[1])
         mimetype = fields[2]
@@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter):
         # TODO: this is where we should check if the file actually has
         # release_ids and/or URLs associated with it
         if existing and not self.bezerk_mode:
-            self.counts['exists'] += 1
-            self.counts['skip'] -= 1
+            self.counts["exists"] += 1
+            self.counts["skip"] -= 1
             return None
 
         release_edit = self.create_release(re)
@@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter):
 
     def parse_grobid_json(self, obj):
 
-        if not obj.get('title'):
+        if not obj.get("title"):
             return None
 
         extra_grobid = dict()
 
-        abstract = obj.get('abstract')
+        abstract = obj.get("abstract")
         if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain",
-                content=clean(obj.get('abstract')))
+                mimetype="text/plain", content=clean(obj.get("abstract"))
+            )
             abstracts = [abobj]
         else:
             abstracts = None
 
         contribs = []
-        for i, a in enumerate(obj.get('authors', [])):
-            contribs.append(fatcat_openapi_client.ReleaseContrib(
-                index=i,
-                raw_name=clean(a['name']),
-                given_name=clean(a.get('given_name')),
-                surname=clean(a.get('surname')),
-                role="author",
-                extra=None))
+        for i, a in enumerate(obj.get("authors", [])):
+            contribs.append(
+                fatcat_openapi_client.ReleaseContrib(
+                    index=i,
+                    raw_name=clean(a["name"]),
+                    given_name=clean(a.get("given_name")),
+                    surname=clean(a.get("surname")),
+                    role="author",
+                    extra=None,
+                )
+            )
 
         refs = []
-        for raw in obj.get('citations', []):
+        for raw in obj.get("citations", []):
             cite_extra = dict()
             year = None
-            if raw.get('date'):
+            if raw.get("date"):
                 try:
-                    year = int(raw['date'].strip()[:4])
+                    year = int(raw["date"].strip()[:4])
                 except (IndexError, ValueError):
                     pass
-            for key in ('volume', 'url', 'issue', 'publisher'):
+            for key in ("volume", "url", "issue", "publisher"):
                 if raw.get(key):
                     cite_extra[key] = clean(raw[key])
-            if raw.get('authors'):
-                cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
+            if raw.get("authors"):
+                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
             if not cite_extra:
                 cite_extra = None
-            refs.append(fatcat_openapi_client.ReleaseRef(
-                key=clean(raw.get('id')),
-                year=year,
-                title=clean(raw['title']),
-                extra=cite_extra))
+            refs.append(
+                fatcat_openapi_client.ReleaseRef(
+                    key=clean(raw.get("id")),
+                    year=year,
+                    title=clean(raw["title"]),
+                    extra=cite_extra,
+                )
+            )
 
         release_date = None
         release_year = None
-        if obj.get('date'):
+        if obj.get("date"):
             # only returns year, ever?
-            release_year = int(obj['date'][:4])
+            release_year = int(obj["date"][:4])
 
         extra = dict()
-        if obj.get('doi'):
-            extra['doi'] = obj['doi']
-        if obj['journal'] and obj['journal'].get('name'):
-            extra['container_name'] = clean(obj['journal']['name'])
+        if obj.get("doi"):
+            extra["doi"] = obj["doi"]
+        if obj["journal"] and obj["journal"].get("name"):
+            extra["container_name"] = clean(obj["journal"]["name"])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
         if extra_grobid:
-            extra['grobid'] = extra_grobid
+            extra["grobid"] = extra_grobid
         if self.longtail_oa:
-            extra['longtail_oa'] = True
+            extra["longtail_oa"] = True
         if not extra:
             extra = None
 
-        title = clean(obj['title'], force_xml=True)
+        title = clean(obj["title"], force_xml=True)
         if not title or len(title) < 2:
             return None
 
@@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter):
             release_year=release_year,
             contribs=contribs,
             refs=refs,
-            publisher=clean(obj['journal'].get('publisher')),
-            volume=clean(obj['journal'].get('volume')),
-            issue=clean(obj['journal'].get('issue')),
+            publisher=clean(obj["journal"].get("publisher")),
+            volume=clean(obj["journal"].get("volume")),
+            issue=clean(obj["journal"].get("issue")),
             abstracts=abstracts,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(),
-            extra=extra)
+            extra=extra,
+        )
         return re
 
     def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
 
-        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
+        sha1 = (
+            base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
+            .decode("ascii")
+            .lower()
+        )
 
         fe = fatcat_openapi_client.FileEntity(
             sha1=sha1,
@@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter):
         )
 
         # parse URLs and CDX
-        original = cdx['url']
-        assert len(cdx['dt']) >= 8
-        wayback = "https://web.archive.org/web/{}/{}".format(
-            cdx['dt'],
-            original)
-        fe.urls.append(
-            fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
+        original = cdx["url"]
+        assert len(cdx["dt"]) >= 8
+        wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
+        fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
         original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
         if original_url is not None:
-            fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]))
+            fe.urls.append(
+                fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])
+            )
 
         return fe
 
@@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter):
         return True
 
     def insert_batch(self, batch):
-        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_file_auto_batch(
+            fatcat_openapi_client.FileAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description, extra=self.editgroup_extra
+                ),
+                entity_list=batch,
+            )
+        )
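
Note: the `parse_file_metadata()` hunk above only reflows the SHA-1 key conversion; the behavior is unchanged. For readers unfamiliar with that expression, the sketch below (not part of the commit; the input bytes and the resulting key are made up for illustration) shows the same base32-to-hex round trip using only the standard library:

    import base64
    import hashlib

    # Hypothetical example: build a CDX-style "sha1:<base32>" key from some bytes,
    # then convert it to lowercase hex the same way parse_file_metadata() does.
    data = b"example pdf bytes"
    digest = hashlib.sha1(data).digest()

    # CDX-style key: base32-encoded SHA-1 digest with a "sha1:" prefix
    sha1_key = "sha1:" + base64.b32encode(digest).decode("ascii")

    # Strip the prefix, decode base32 to raw bytes, re-encode as base16 (hex), lowercase
    sha1_hex = (
        base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
        .decode("ascii")
        .lower()
    )

    assert sha1_hex == hashlib.sha1(data).hexdigest()
    print(sha1_key, "->", sha1_hex)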