from typing import Any, Dict, List, Optional

import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ContainerEntity

from fatcat_tools.normal import clean_str

from .common import EntityImporter


class ChoculaImporter(EntityImporter):
    """
    Creates or updates container entities based on the output of the "chocula"
    script, which munges/processes journal metadata from several sources,
    including fatcat itself.

    See the guide for details on the many 'extra' fields used here.
    """

    def __init__(self, api: ApiClient, **kwargs) -> None:

        eg_desc = kwargs.get(
            "editgroup_description",
            "Automated import of container-level metadata from Chocula tool.",
        )
        eg_extra = kwargs.get("editgroup_extra", dict())
        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter")
        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)

    def want(self, raw_record: Any) -> bool:
        if not raw_record.get("ident") and not raw_record.get("_known_issnl"):
            self.counts["skip-unknown-new-issnl"] += 1
            return False
        if raw_record.get("issnl") and raw_record.get("name"):
            return True
        return False

    def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]:
        """
        row is a python dict (parsed from JSON).

        returns a ContainerEntity (or None if invalid or couldn't parse)
        """

        name = clean_str(row.get("name"))
        if not name:
            # Name is required (by schema)
            return None

        name = name.strip()

        if name.endswith(", Proceedings of the"):
            name = "Proceedings of the " + name.split(",")[0]

        if name.endswith("."):
            name = name[:-1]

        extra = dict()
        for k in (
            "urls",
            "webarchive_urls",
            "country",
            "sherpa_romeo",
            "ezb",
            "szczepanski",
            "doaj",
            "languages",
            "ia",
            "scielo",
            "kbart",
            "publisher_type",
            "platform",
        ):
            if row["extra"].get(k):
                extra[k] = row["extra"][k]

        container_type = None
        if "proceedings" in name.lower():
            container_type = "proceedings"
        elif "journal " in name.lower():
            container_type = "journal"

        ce = ContainerEntity(
            issnl=row["issnl"],
            issnp=row["extra"].get("issnp"),
            issne=row["extra"].get("issne"),
            ident=row["ident"],
            name=name,
            container_type=container_type,
            publisher=clean_str(row.get("publisher")),
            wikidata_qid=row.get("wikidata_qid"),
            extra=extra,
        )
        return ce
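
    # Illustrative sketch (not from the chocula source) of the minimal row
    # shape that want() and parse_record() above expect. The field names are
    # exactly those read by the code above; the values are made-up examples:
    #
    #     row = {
    #         "issnl": "1234-5678",
    #         "ident": None,
    #         "_known_issnl": True,
    #         "name": "Journal of Examples",
    #         "publisher": "Example Press",
    #         "wikidata_qid": None,
    #         "extra": {"country": "us", "urls": ["https://example.com/"]},
    #     }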

    def try_update(self, ce: ContainerEntity) -> bool:

        existing = None
        if ce.ident:
            try:
                existing = self.api.get_container(ce.ident)
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
                self.counts["exists"] += 1
                self.counts["exists-not-found"] += 1
                return False
            if existing.state != "active":
                self.counts["exists"] += 1
                self.counts["exists-inactive"] += 1
                return False

        if not existing:
            # check if existing by ISSN-L
            try:
                existing = self.api.lookup_container(issnl=ce.issnl)
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
            if existing:
                self.counts["exists"] += 1
                self.counts["exists-by-issnl"] += 1
                return False
            # doesn't exist, always create
            return True

        # decide whether to update
        do_update = False
        if not self.do_updates:
            self.counts["exists"] += 1
            return False
        if not existing.extra:
            existing.extra = dict()
        if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set(
            existing.extra.get("urls", [])
        ):
            do_update = True
        if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set(
            existing.extra.get("webarchive_urls", [])
        ):
            do_update = True
        for k in ("ezb", "szczepanski", "publisher_type", "platform"):
            if ce.extra.get(k) and not existing.extra.get(k):
                do_update = True
        for k in ("kbart", "ia", "doaj"):
            # always update these fields if not equal (chocula override)
            if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):
                do_update = True
        if ce.publisher and not existing.publisher:
            do_update = True
        if ce.wikidata_qid and not existing.wikidata_qid:
            do_update = True

        if do_update:
            existing.wikidata_qid = existing.wikidata_qid or ce.wikidata_qid
            existing.publisher = existing.publisher or ce.publisher
            existing.container_type = existing.container_type or ce.container_type
            existing.issne = existing.issne or ce.issne
            existing.issnp = existing.issnp or ce.issnp
            for k in ("urls", "webarchive_urls"):
                # be conservative about URL updates; don't clobber existing
                # URL lists. may want to make this behavior more sophisticated
                # in the future, or at least a config flag
                if ce.extra.get(k) and not existing.extra.get(k):
                    existing.extra[k] = ce.extra.get(k, [])
            for k in (
                "sherpa_romeo",
                "ezb",
                "szczepanski",
                "doaj",
                "ia",
                "scielo",
                "kbart",
                "publisher_type",
                "platform",
            ):
                # always update (chocula over-rides)
                if ce.extra.get(k):
                    existing.extra[k] = ce.extra[k]
            for k in ("country",):
                # only include if not set (don't clobber human edits)
                if ce.extra.get(k) and not existing.extra.get(k):
                    existing.extra[k] = ce.extra[k]
            if ce.extra.get("languages"):
                if not existing.extra.get("languages"):
                    existing.extra["languages"] = ce.extra["languages"]
                elif ce.extra["languages"][0] not in existing.extra["languages"]:
                    existing.extra["languages"].append(ce.extra["languages"][0])
            self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
            self.counts["update"] += 1
            return False
        else:
            self.counts["exists"] += 1
            self.counts["exists-skip-update"] += 1
            return False

        # if we got this far, it's a bug
        raise NotImplementedError

    def insert_batch(self, batch: List[ContainerEntity]) -> None:
        self.api.create_container_auto_batch(
            fatcat_openapi_client.ContainerAutoBatch(
                editgroup=fatcat_openapi_client.Editgroup(
                    description=self.editgroup_description, extra=self.editgroup_extra
                ),
                entity_list=batch,
            )
        )
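

# A minimal usage sketch, not part of the original module. It assumes the
# `authenticated_api` helper and the `JsonLinePusher` wrapper exported by
# fatcat_tools, plus a line-delimited JSON export from the chocula tool
# (the file name and API URL below are illustrative only):
#
#     from fatcat_tools import authenticated_api
#     from fatcat_tools.importers import ChoculaImporter, JsonLinePusher
#
#     api = authenticated_api("https://api.fatcat.wiki/v0")
#     importer = ChoculaImporter(api, do_updates=True)
#     with open("chocula_export.json") as f:
#         JsonLinePusher(importer, f).run()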