From 58ff361eb481bee9d2ef7249f48f94729d2a830d Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 17 Dec 2020 01:55:30 -0800
Subject: very simple dblp container importer

---
 python/fatcat_tools/importers/__init__.py       |   1 +
 python/fatcat_tools/importers/dblp_container.py | 144 ++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 python/fatcat_tools/importers/dblp_container.py

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index a14e2cec..6a2edeac 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -32,3 +32,4 @@ from .shadow import ShadowLibraryImporter
 from .file_meta import FileMetaImporter
 from .doaj_article import DoajArticleImporter
 from .dblp_release import DblpReleaseImporter
+from .dblp_container import DblpContainerImporter
diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py
new file mode 100644
index 00000000..a9f993a8
--- /dev/null
+++ b/python/fatcat_tools/importers/dblp_container.py
@@ -0,0 +1,144 @@
+
+"""
+Importer for DBLP container-level (journal/conference/series) metadata,
+pre-scraped into JSON from HTML pages.
+"""
+
+import sys # noqa: F401
+
+import fatcat_openapi_client
+from fatcat_tools.normal import clean_str
+from fatcat_tools.importers.common import EntityImporter
+
+
+class DblpContainerImporter(EntityImporter):
+
+    def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of container-level metadata scraped from dblp HTML")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self.dblp_container_map_output = dblp_container_map_output
+        self.read_dblp_container_map_file(dblp_container_map_file)
+        self.read_issn_map_file(issn_map_file)
+        print("\t".join(["dblp_prefix", "container_id"]), file=self.dblp_container_map_output)
+
+    def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
+        self._dblp_container_map = dict()
+        print("Loading existing dblp prefix container map file...", file=sys.stderr)
+        for line in dblp_container_map_file:
+            if line.startswith("dblp_prefix") or not line.strip():
+                continue
+            (prefix, container_id) = line.split()[0:2]
+            assert len(container_id) == 26
+            self._dblp_container_map[prefix] = container_id
+            print("\t".join([prefix, container_id]), file=self.dblp_container_map_output)
+        print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+
+    def lookup_dblp_prefix(self, prefix):
+        if not prefix:
+            return None
+        return self._dblp_container_map.get(prefix)
+
+    def want(self, raw_record):
+        return True
+
+    def parse_record(self, row):
+        """
+        row is a python dict (parsed from JSON).
+
+        returns a ContainerEntity (or None if invalid or couldn't parse)
+        """
+
+        dblp_prefix = row.get('key') or row.get('dblp_prefix')
+        assert dblp_prefix
+        assert row['title']
+
+        container_type = None
+        if dblp_prefix.startswith('conf/'):
+            container_type = "conference-series"
+        elif dblp_prefix.startswith('journals/'):
+            container_type = "journal"
+        elif dblp_prefix.startswith('series/'):
+            container_type = "book-series"
+
+        issnl = None
+        for issn in row.get('issns', []):
+            issnl = self.issn2issnl(issn)
+            if issnl:
+                break
+
+        extra = {
+            'dblp': {
+                'prefix': dblp_prefix,
+            },
+        }
+
+        if row.get('homepage_url'):
+            extra['urls'] = [row['homepage_url']]
+
+        if row.get('acronym'):
+            extra['acronym'] = row['acronym']
+
+        ce = fatcat_openapi_client.ContainerEntity(
+            name=clean_str(row['title']),
+            container_type=container_type,
+            issnl=issnl,
+            wikidata_qid=row.get('wikidata_qid'),
+            extra=extra,
+        )
+        return ce
+
+    def try_update(self, ce):
+
+        dblp_prefix = ce.extra['dblp']['prefix']
+        existing = None
+        existing_container_id = self.lookup_dblp_prefix(dblp_prefix)
+        if existing_container_id:
+            existing = self.api.get_container(existing_container_id)
+        if not existing and ce.issnl:
+            # check if existing by ISSN-L
+            try:
+                existing = self.api.lookup_container(issnl=ce.issnl)
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+        if not existing and ce.wikidata_qid:
+            try:
+                existing = self.api.lookup_container(wikidata_qid=ce.wikidata_qid)
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+
+        # TODO: plan to add a fuzzy match check here
+
+        if not existing:
+            return True
+
+        if existing:
+            self.counts['exists'] += 1
+            print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output)
+            return False
+
+        # shouldn't get here
+        raise NotImplementedError()
+
+    def insert_batch(self, batch):
+        """
+        Because we want to print a prefix/container_id match for each row, we
+        require a special batch insert method
+        """
+        eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+        for c_edit in eg.edits.containers:
+            c = self.api.get_container(c_edit.ident)
+            print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output)
--
cgit v1.2.3
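
Usage note: below is a minimal sketch of how this importer might be driven,
modeled on how sibling importers are wired up elsewhere in fatcat. It assumes
the JsonLinePusher helper exported from fatcat_tools.importers and an
authenticated_api() client helper from fatcat_tools; all file paths and the
API host shown are hypothetical placeholders, not part of this patch.

    import sys

    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import DblpContainerImporter, JsonLinePusher

    # hypothetical paths; a real invocation would take these as CLI arguments
    api = authenticated_api("https://api.fatcat.wiki/v0")
    with open("ISSN-to-ISSN-L.txt") as issn_map_file, \
            open("existing_dblp_container_map.tsv") as dblp_container_map_file, \
            open("new_dblp_container_map.tsv", "w") as dblp_container_map_output:
        importer = DblpContainerImporter(
            api,
            issn_map_file,
            dblp_container_map_file,
            dblp_container_map_output,
        )
        # one scraped-JSON container record per line on stdin
        JsonLinePusher(importer, sys.stdin).run()

The pusher feeds each parsed JSON record through want()/parse_record()/
try_update(). Both the "exists" branch of try_update() and the batch-create
path in insert_batch() print a prefix-to-ident line to the output map file,
which is what lets the prefix/container map stay complete across re-runs.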