""" Importer for DBLP container-level (journal/conference/series) metadata, pre-scraped in to JSON from HTML pages. """ import sys # noqa: F401 from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ContainerEntity from fatcat_tools.importers.common import EntityImporter from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): def __init__( self, api: ApiClient, issn_map_file: Sequence, dblp_container_map_file: Sequence, dblp_container_map_output: Any, **kwargs ) -> None: eg_desc = kwargs.get( "editgroup_description", "Automated import of container-level metadata scraped from dblp HTML", ) eg_extra = kwargs.get("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpContainerImporter") super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.dblp_container_map_output = dblp_container_map_output self.read_dblp_container_map_file(dblp_container_map_file) self.read_issn_map_file(issn_map_file) print("\t".join(["dblp_prefix", "container_id"]), file=self.dblp_container_map_output) def read_dblp_container_map_file(self, dblp_container_map_file: Sequence) -> None: self._dblp_container_map = dict() print("Loading existing dblp prefix container map file...", file=sys.stderr) for line in dblp_container_map_file: if line.startswith("dblp_prefix") or len(line) == 0: continue (prefix, container_id) = line.split()[0:2] assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id print("\t".join([prefix, container_id]), file=self.dblp_container_map_output) print( "Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr, ) def lookup_dblp_prefix(self, prefix: str) -> Optional[str]: if not prefix: return None return self._dblp_container_map.get(prefix) def want(self, raw_record: Any) -> bool: return True def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ row is a python dict (parsed from JSON). 
returns a ContainerEntity (or None if invalid or couldn't parse) """ dblp_prefix = row.get("key") or row.get("dblp_prefix") assert dblp_prefix assert row["title"] container_type = None if dblp_prefix.startswith("conf/"): container_type = "conference-series" elif dblp_prefix.startswith("journals/"): container_type = "journal" elif dblp_prefix.startswith("series/"): container_type = "book-series" issnl = None for issn in row.get("issns", []): issnl = self.issn2issnl(issn) if issnl: break extra: Dict[str, Any] = { "dblp": { "prefix": dblp_prefix, }, } if row.get("homepage_url"): extra["urls"] = [row["homepage_url"]] if row.get("acronym"): extra["acronym"] = row["acronym"] ce = fatcat_openapi_client.ContainerEntity( name=clean_str(row["title"]), container_type=container_type, issnl=issnl, wikidata_qid=row.get("wikidata_qid"), extra=extra, ) return ce def try_update(self, ce: ContainerEntity) -> bool: dblp_prefix = ce.extra["dblp"]["prefix"] existing = None existing_container_id = self.lookup_dblp_prefix(dblp_prefix) if existing_container_id: existing = self.api.get_container(existing_container_id) if not existing and ce.issnl: # check if existing by ISSN-L try: existing = self.api.lookup_container(issnl=ce.issnl) except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err if not existing and ce.wikidata_qid: try: existing = self.api.lookup_container(wikidata_qid=ce.wikidata_qid) except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err # TODO: plan to add a fuzzy match check here if not existing: return True if existing: self.counts["exists"] += 1 print( "\t".join([ce.extra["dblp"]["prefix"], existing.ident]), file=self.dblp_container_map_output, ) return False # shouldn't get here raise NotImplementedError() def insert_batch(self, batch: List[ContainerEntity]) -> None: """ Because we want to print a prefix/container_id match for each row, we require a special batch insert method """ eg = self.api.create_container_auto_batch( fatcat_openapi_client.ContainerAutoBatch( editgroup=fatcat_openapi_client.Editgroup( description=self.editgroup_description, extra=self.editgroup_extra ), entity_list=batch, ) ) for c_edit in eg.edits.containers: c = self.api.get_container(c_edit.ident) print( "\t".join([c.extra["dblp"]["prefix"], c.ident]), file=self.dblp_container_map_output, )
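
# A minimal usage sketch (not part of the importer itself): one way this class
# might be driven from a script. `authenticated_api` and `JsonLinePusher` are
# assumed from elsewhere in fatcat_tools (they are not imported above), and
# all file paths here are hypothetical.
#
#   from fatcat_tools import authenticated_api
#   from fatcat_tools.importers import JsonLinePusher
#
#   api = authenticated_api("https://api.fatcat.wiki/v0")
#   with open("ISSN-to-ISSN-L.txt") as issn_map_file, open(
#       "existing_dblp_container_map.tsv"
#   ) as map_file, open("new_dblp_container_map.tsv", "w") as map_output, open(
#       "dblp_containers.json"
#   ) as json_file:
#       importer = DblpContainerImporter(api, issn_map_file, map_file, map_output)
#       JsonLinePusher(importer, json_file).run()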