#!/usr/bin/env python3

import base64
import json
from typing import Any, Dict, List, Optional

import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity

from fatcat_tools.normal import clean_doi, clean_str

from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url


class GrobidMetadataImporter(EntityImporter):
    """
    This is a complex case: we need to parse and create both file and release
    entities.

    The "primary" entity here is really the File, not the Release. If a
    matching File exists, we bail in want(); if not, we create the Release
    during parsing and insert both.

    Input rows are TSV lines with five fields: sha1_key, CDX JSON, mimetype,
    file size, and GROBID metadata JSON (see the usage sketch at the end of
    this module).

    TODO: should instead check if the File has any releases; if not, insert
    and update.
    TODO: relaxing the 'None' constraint on parse_record() might make this
    refactor-able.
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:
        eg_desc = kwargs.get(
            "editgroup_description",
            "Import of release and file metadata, as extracted from PDFs by GROBID.",
        )
        eg_extra = kwargs.get("editgroup_extra", dict())
        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter")
        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
        self.default_link_rel = kwargs.get("default_link_rel", "web")
        self.longtail_oa = kwargs.get("longtail_oa", False)

    def want(self, raw_record: Any) -> bool:
        return True

    def parse_record(self, row: str) -> Optional[FileEntity]:

        # each row is a TSV line: sha1_key, CDX JSON, mimetype, file size, GROBID JSON
        fields = row.split("\t")
        sha1_key = fields[0]
        cdx = json.loads(fields[1])
        mimetype = fields[2]
        file_size = int(fields[3])
        grobid_meta = json.loads(fields[4])
        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
        re = self.parse_grobid_json(grobid_meta)
        if not (fe and re):
            return None

        # lookup existing file by SHA-1
        existing = None
        try:
            existing = self.api.lookup_file(sha1=fe.sha1)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        # if the file is already in here, presumably it is not actually long-tail
        # HACK: this is doing an exists check in parse_record(), which is weird
        # TODO: this is where we should check if the file actually has
        # release_ids and/or URLs associated with it
        if existing and not self.bezerk_mode:
            # the caller counts a None return as a skip, so compensate here
            self.counts["exists"] += 1
            self.counts["skip"] -= 1
            return None

        release_edit = self.create_release(re)
        fe.release_ids.append(release_edit.ident)
        return fe

    def parse_grobid_json(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:

        if not obj.get("title"):
            return None

        extra_grobid: Dict[str, Any] = dict()

        abstract = obj.get("abstract")
        if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
            abobj = fatcat_openapi_client.ReleaseAbstract(
                mimetype="text/plain", content=clean_str(obj.get("abstract"))
            )
            abstracts = [abobj]
        else:
            abstracts = []

        contribs = []
        for i, a in enumerate(obj.get("authors", [])):
            contribs.append(
                fatcat_openapi_client.ReleaseContrib(
                    index=i,
                    raw_name=clean_str(a["name"]),
                    given_name=clean_str(a.get("given_name")),
                    surname=clean_str(a.get("surname")),
                    role="author",
                    extra=None,
                )
            )

        refs = []
        for raw in obj.get("citations", []):
            cite_extra: Dict[str, Any] = dict()
            year = None
            if raw.get("date"):
                try:
                    year = int(raw["date"].strip()[:4])
                except (IndexError, ValueError):
                    pass
            for key in ("volume", "url", "issue", "publisher"):
                if raw.get(key):
                    cite_extra[key] = clean_str(raw[key])
            if raw.get("authors"):
                cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
            refs.append(
                fatcat_openapi_client.ReleaseRef(
                    key=clean_str(raw.get("id")),
                    year=year,
                    title=clean_str(raw["title"]),
                    extra=cite_extra or None,
                )
            )

        release_date = None
        release_year = None
        if obj.get("date"):
            # only returns year, ever?
            release_year = int(obj["date"][:4])

        extra: Dict[str, Any] = dict()
        doi = clean_doi(obj.get("doi"))
        if doi:
            extra["doi"] = doi
        if obj["journal"] and obj["journal"].get("name"):
            extra["container_name"] = clean_str(obj["journal"]["name"])

        # TODO: ISSN/eISSN handling? or just journal name lookup?

        if extra_grobid:
            extra["grobid"] = extra_grobid
        if self.longtail_oa:
            extra["longtail_oa"] = True

        clean_title = clean_str(obj["title"], force_xml=True)
        if not clean_title or len(clean_title) < 2:
            return None
        title = clean_title

        re = fatcat_openapi_client.ReleaseEntity(
            title=title,
            release_type="article-journal",
            release_date=release_date,
            release_year=release_year,
            contribs=contribs,
            refs=refs,
            publisher=clean_str(obj["journal"].get("publisher")),
            volume=clean_str(obj["journal"].get("volume")),
            issue=clean_str(obj["journal"].get("issue")),
            abstracts=abstracts or None,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
            extra=extra or None,
        )
        return re

    def parse_file_metadata(
        self, sha1_key: str, cdx: Dict[str, Any], mimetype: str, file_size: int
    ) -> FileEntity:

        sha1 = (
            base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
            .decode("ascii")
            .lower()
        )

        fe = fatcat_openapi_client.FileEntity(
            sha1=sha1,
            size=int(file_size),
            mimetype=mimetype,
            release_ids=[],
            urls=[],
        )

        # parse URLs and CDX
        original = cdx["url"]
        assert len(cdx["dt"]) >= 8
        wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
        fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
        original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
        if original_url is not None:
            fe.urls.append(
                fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])
            )

        return fe

    def try_update(self, re: FileEntity) -> bool:
        # already did the exists check in parse_record(), because we needed to create a release
        return True

    def insert_batch(self, batch: List[FileEntity]) -> None:
        self.api.create_file_auto_batch(
            fatcat_openapi_client.FileAutoBatch(
                editgroup=fatcat_openapi_client.Editgroup(
                    description=self.editgroup_description, extra=self.editgroup_extra
                ),
                entity_list=batch,
            )
        )
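
# Usage sketch (illustrative only, not part of the importer): each input row
# is a TSV line of five fields,
#   "<sha1_key>\t<cdx_json>\t<mimetype>\t<size_bytes>\t<grobid_json>".
# The helper names below (authenticated_api, LinePusher) reflect how importers
# are typically driven elsewhere in fatcat_tools; treat them as assumptions
# rather than a canonical entry point for this module.
#
#   import sys
#   from fatcat_tools import authenticated_api
#   from fatcat_tools.importers import LinePusher
#
#   api = authenticated_api("https://api.fatcat.wiki/v0")
#   importer = GrobidMetadataImporter(api, longtail_oa=True, default_link_rel="web")
#   LinePusher(importer, sys.stdin).run()
#   print(importer.counts)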