author    | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700
commit    | 31d1a6a713d177990609767d508209ced19ca396 (patch)
tree      | a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers/dblp_release.py
parent    | 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download  | fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz, fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers/dblp_release.py')
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 257
1 file changed, 132 insertions(+), 125 deletions(-)
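
The diff below is pure black output: quote normalization, call-site reflowing, and comment spacing, with no logic changes. As a minimal sketch of the transformation (assuming the black package is importable; the repository presumably pins its own line length, defaults shown here), black's Python API reproduces the style seen in the hunks:

    import black

    # Pre-commit style: single quotes, hanging continuation arguments.
    src = (
        "eg_extra = kwargs.get('editgroup_extra', dict())\n"
        "eg_extra['agent'] = eg_extra.get('agent',\n"
        "                                 'fatcat_tools.DblpReleaseImporter')\n"
    )

    # black normalizes quotes and collapses the call onto one line, matching
    # the corresponding "+" lines in the first hunks of this commit.
    print(black.format_str(src, mode=black.Mode()))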
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 6d028f2f..5baa6cd6 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -1,4 +1,3 @@
-
 """
 Importer for DBLP release-level (article/paper/etc) XML metadata.
 
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict
 
 class DblpReleaseImporter(EntityImporter):
-
-    def __init__(self,
-                 api,
-                 dblp_container_map_file=None,
-                 **kwargs):
+    def __init__(self, api, dblp_container_map_file=None, **kwargs):
 
         eg_desc = kwargs.get(
-            'editgroup_description',
-            "Automated import of dblp metadata via XML records"
+            "editgroup_description", "Automated import of dblp metadata via XML records"
         )
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent',
-                                         'fatcat_tools.DblpReleaseImporter')
+        eg_extra = kwargs.get("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter")
 
         # ensure default is to not do updates with this worker (override super() default)
-        kwargs['do_updates'] = kwargs.get("do_updates", False)
-        super().__init__(api,
-                         editgroup_description=eg_desc,
-                         editgroup_extra=eg_extra,
-                         **kwargs)
+        kwargs["do_updates"] = kwargs.get("do_updates", False)
+        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
 
         self.dump_json_mode = kwargs.get("dump_json_mode", False)
         self.this_year = datetime.datetime.now().year
@@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter):
         "phdthesis",
         "mastersthesis",
         "www",
-        #"data", # no instances in 2020-11 dump
+        # "data", # no instances in 2020-11 dump
     ]
 
     def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
         self._dblp_container_map = dict()
         if not dblp_container_map_file:
-            print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr)
+            print(
+                "Not loading a dblp prefix container map file; entities will fail to import",
+                file=sys.stderr,
+            )
             return
         print("Loading dblp prefix container map file...", file=sys.stderr)
         for line in dblp_container_map_file:
@@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter):
             container_id = container_id.strip()
             assert len(container_id) == 26
             self._dblp_container_map[prefix] = container_id
-        print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+        print(
+            "Got {} dblp container mappings.".format(len(self._dblp_container_map)),
+            file=sys.stderr,
+        )
 
     def lookup_dblp_prefix(self, prefix):
         if not prefix:
@@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter):
 
     def want(self, xml_elem):
         if xml_elem.name not in self.ELEMENT_TYPES:
-            self.counts['skip-type'] += 1
+            self.counts["skip-type"] += 1
             return False
-        if not xml_elem.get('key'):
-            self.counts['skip-no-key'] += 1
+        if not xml_elem.get("key"):
+            self.counts["skip-no-key"] += 1
             return False
-        if xml_elem['key'].startswith('homepage/'):
-            self.counts['skip-type-homepage'] += 1
+        if xml_elem["key"].startswith("homepage/"):
+            self.counts["skip-type-homepage"] += 1
             return False
         return True
 
@@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter):
         - isbn
         """
 
-        dblp_key = xml_elem.get('key')
+        dblp_key = xml_elem.get("key")
         if not dblp_key:
-            self.counts['skip-empty-key'] += 1
+            self.counts["skip-empty-key"] += 1
             return False
-        dblp_key_type = dblp_key.split('/')[0]
+        dblp_key_type = dblp_key.split("/")[0]
 
         # dblp_prefix may be used for container lookup
         dblp_prefix = None
-        if dblp_key_type in ('journals', 'conf'):
-            dblp_prefix = '/'.join(dblp_key.split('/')[:2])
-        elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
-            dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
+        if dblp_key_type in ("journals", "conf"):
+            dblp_prefix = "/".join(dblp_key.split("/")[:2])
+        elif dblp_key_type in ("series", "reference", "tr", "books"):
+            dblp_prefix = "/".join(dblp_key.split("/")[:-1])
 
-        publtype = xml_elem.get('publtype') or None
+        publtype = xml_elem.get("publtype") or None
 
         dblp_type = xml_elem.name
         if dblp_type not in self.ELEMENT_TYPES:
-            self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+            self.counts[f"skip-dblp-type:{dblp_type}"] += 1
 
-        if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
-            self.counts['skip-key-type'] += 1
+        if dblp_key_type in ("homepages", "persons", "dblpnote"):
+            self.counts["skip-key-type"] += 1
             return False
 
-        if dblp_key.startswith('journals/corr/'):
-            self.counts['skip-arxiv-corr'] += 1
+        if dblp_key.startswith("journals/corr/"):
+            self.counts["skip-arxiv-corr"] += 1
             return False
 
         title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
         if not title:
-            self.counts['skip-title'] += 1
+            self.counts["skip-title"] += 1
             return False
-        if title.endswith('.'):
+        if title.endswith("."):
            title = title[:-1]
 
         release_type = None
-        release_stage = 'published'
+        release_stage = "published"
         withdrawn_status = None
 
         # primary releae_type detection: type of XML element, then prefix of key for granularity
-        if dblp_type == 'article':
-            release_type = 'article'
-            if dblp_key_type == 'journals' and publtype != 'informal':
-                release_type = 'article-journal'
-            elif dblp_key_type == 'tr':
-                release_type = 'report'
+        if dblp_type == "article":
+            release_type = "article"
+            if dblp_key_type == "journals" and publtype != "informal":
+                release_type = "article-journal"
+            elif dblp_key_type == "tr":
+                release_type = "report"
             elif title.startswith("Review:"):
-                release_type = 'review'
-        elif dblp_type == 'inproceedings':
-            release_type = 'paper-conference'
-        elif dblp_type == 'book':
-            release_type = 'book'
-        elif dblp_type == 'incollection':
+                release_type = "review"
+        elif dblp_type == "inproceedings":
+            release_type = "paper-conference"
+        elif dblp_type == "book":
+            release_type = "book"
+        elif dblp_type == "incollection":
             # XXX: part vs. chapter?
-            release_type = 'chapter'
-        elif dblp_type == 'data':
-            release_type = 'dataset'
-        elif dblp_type in ('mastersthesis', 'phdthesis'):
-            release_type = 'thesis'
+            release_type = "chapter"
+        elif dblp_type == "data":
+            release_type = "dataset"
+        elif dblp_type in ("mastersthesis", "phdthesis"):
+            release_type = "thesis"
 
         # overrides/extensions of the above
-        if publtype == 'informal':
+        if publtype == "informal":
             # for conferences, seems to indicate peer-review status
             # for journals, seems to indicate things like book reviews; split out above
             pass
-        elif publtype == 'encyclopedia':
-            release_type = 'entry-encyclopedia'
-        elif publtype == 'edited':
+        elif publtype == "encyclopedia":
+            release_type = "entry-encyclopedia"
+        elif publtype == "edited":
             # XXX: article?
-            release_type = 'editorial'
-        elif publtype == 'data':
-            release_type = 'dataset'
-        elif publtype == 'data':
-            release_type = 'dataset'
-        elif publtype == 'software':
-            release_type = 'software'
-        elif publtype == 'widthdrawn':
-            withdrawn_status = 'widthdrawn'
-        elif publtype == 'survey':
+            release_type = "editorial"
+        elif publtype == "data":
+            release_type = "dataset"
+        elif publtype == "data":
+            release_type = "dataset"
+        elif publtype == "software":
+            release_type = "software"
+        elif publtype == "widthdrawn":
+            withdrawn_status = "widthdrawn"
+        elif publtype == "survey":
             # XXX: flag as a review/survey article?
             pass
 
-        #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
+        # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
 
         container_name = None
         booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
@@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter):
         part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
 
         # block bogus far-future years/dates
-        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+        if release_year is not None and (
+            release_year > (self.this_year + 5) or release_year < 1000
+        ):
             release_month = None
             release_year = None
 
@@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter):
         if isbn:
             ext_ids.isbn13 = isbn
         if ext_ids.doi:
-            self.counts['has-doi'] += 1
+            self.counts["has-doi"] += 1
 
         # dblp-specific extra
         dblp_extra = dict(type=dblp_type)
         note = clean_str(xml_elem.note and xml_elem.note.text)
-        if note and 'base-search.net' not in note:
-            dblp_extra['note'] = note
+        if note and "base-search.net" not in note:
+            dblp_extra["note"] = note
         if part_of_key:
-            dblp_extra['part_of_key'] = part_of_key
+            dblp_extra["part_of_key"] = part_of_key
 
         # generic extra
         extra = dict()
         if not container_id and container_name:
-            extra['container_name'] = container_name
+            extra["container_name"] = container_name
 
-        if series and (dblp_key_type == 'series' or dblp_type == 'book'):
-            extra['series-title'] = series
+        if series and (dblp_key_type == "series" or dblp_type == "book"):
+            extra["series-title"] = series
         elif series:
-            dblp_extra['series'] = series
+            dblp_extra["series"] = series
 
-        if booktitle and dblp_key_type == 'series':
-            extra['container-title'] = booktitle
-        elif booktitle and dblp_key_type == 'conf':
-            extra['event'] = booktitle
+        if booktitle and dblp_key_type == "series":
+            extra["container-title"] = booktitle
+        elif booktitle and dblp_key_type == "conf":
+            extra["event"] = booktitle
         elif booktitle:
-            dblp_extra['booktitle'] = booktitle
+            dblp_extra["booktitle"] = booktitle
 
         if release_year and release_month:
             # TODO: release_month schema migration
-            extra['release_month'] = release_month
+            extra["release_month"] = release_month
 
         if dblp_extra:
-            extra['dblp'] = dblp_extra
+            extra["dblp"] = dblp_extra
         if not extra:
             extra = None
 
@@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter):
             withdrawn_status=withdrawn_status,
             title=title,
             release_year=release_year,
-            #release_date,
+            # release_date,
             publisher=publisher,
             ext_ids=ext_ids,
             contribs=contribs,
@@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter):
 
         if self.dump_json_mode:
             re_dict = entity_to_dict(re, api_client=self.api.api_client)
-            re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
-            re_dict['_dblp_prefix'] = dblp_prefix
+            re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem)
+            re_dict["_dblp_prefix"] = dblp_prefix
             print(json.dumps(re_dict, sort_keys=True))
             return False
 
@@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter):
 
         # then try other ext_id lookups
         if not existing:
-            for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
+            for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"):
                 extid_val = getattr(re.ext_ids, extid_type)
                 if not extid_val:
                     continue
-                #print(f" lookup release type: {extid_type} val: {extid_val}")
+                # print(f" lookup release type: {extid_type} val: {extid_val}")
                 try:
                     existing = self.api.lookup_release(**{extid_type: extid_val})
                 except fatcat_openapi_client.rest.ApiException as err:
@@ -373,12 +371,14 @@ class DblpReleaseImporter(EntityImporter):
             return True
 
         if not self.do_updates or existing.ext_ids.dblp:
-            self.counts['exists'] += 1
+            self.counts["exists"] += 1
             return False
 
         # logic for whether to do update or skip
-        if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
-            self.counts['skip-update'] += 1
+        if (
+            existing.container_id and existing.release_type and existing.release_stage
+        ) or existing.ext_ids.arxiv:
+            self.counts["skip-update"] += 1
             return False
 
         # fields to copy over for update
@@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter):
         existing.release_stage = existing.release_stage or re.release_stage
         existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
         existing.container_id = existing.container_id or re.container_id
-        existing.extra['dblp'] = re.extra['dblp']
+        existing.extra["dblp"] = re.extra["dblp"]
         existing.volume = existing.volume or re.volume
         existing.issue = existing.issue or re.issue
         existing.pages = existing.pages or re.pages
 
         try:
             self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
-            self.counts['update'] += 1
+            self.counts["update"] += 1
         except fatcat_openapi_client.rest.ApiException as err:
             # there is a code path where we try to update the same release
             # twice in a row; if that happens, just skip
             # NOTE: API behavior might change in the future?
             if "release_edit_editgroup_id_ident_id_key" in err.body:
-                self.counts['skip-update-conflict'] += 1
+                self.counts["skip-update-conflict"] += 1
                 return False
             else:
                 raise err
@@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter):
         return False
 
     def insert_batch(self, batch):
-        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_release_auto_batch(
+            fatcat_openapi_client.ReleaseAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description, extra=self.editgroup_extra
+                ),
+                entity_list=batch,
+            )
+        )
 
     def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
         """
@@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter):
         """
         contribs = []
         index = 0
-        for elem in authors.find_all('author'):
+        for elem in authors.find_all("author"):
             contrib = self.dblp_contrib_single(elem)
             contrib.role = "author"
             contrib.index = index
             contribs.append(contrib)
             index += 1
-        for elem in authors.find_all('editor'):
+        for elem in authors.find_all("editor"):
             contrib = self.dblp_contrib_single(elem)
             contrib.role = "editor"
             contribs.append(contrib)
 
@@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter):
 
         # remove number in author name, if present
         if raw_name.split()[-1].isdigit():
-            raw_name = ' '.join(raw_name.split()[:-1])
+            raw_name = " ".join(raw_name.split()[:-1])
 
-        if elem.get('orcid'):
-            orcid = clean_orcid(elem['orcid'])
+        if elem.get("orcid"):
+            orcid = clean_orcid(elem["orcid"])
             if orcid:
                 creator_id = self.lookup_orcid(orcid)
                 if not creator_id:
@@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter):
         wikidata_qid: Optional[str] = None
         arxiv_id: Optional[str] = None
         hdl: Optional[str] = None
-        for ee in xml_elem.find_all('ee'):
+        for ee in xml_elem.find_all("ee"):
             url = ee.text
             # convert DOI-like domains, which mostly have DOIs anyways
-            if '://doi.acm.org/' in url:
-                url = url.replace('://doi.acm.org/', '://doi.org/')
-            elif '://doi.ieeecomputersociety.org/' in url:
-                url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+            if "://doi.acm.org/" in url:
+                url = url.replace("://doi.acm.org/", "://doi.org/")
+            elif "://doi.ieeecomputersociety.org/" in url:
+                url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/")
 
-            if 'doi.org/10.' in url and not doi:
+            if "doi.org/10." in url and not doi:
                 doi = clean_doi(url)
-            elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+            elif "wikidata.org/entity/Q" in url and not wikidata_qid:
                 wikidata_qid = clean_wikidata_qid(url)
-            elif '://arxiv.org/abs/' in url and not arxiv_id:
-                arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+            elif "://arxiv.org/abs/" in url and not arxiv_id:
+                arxiv_id = (
+                    url.replace("http://", "")
+                    .replace("https://", "")
+                    .replace("arxiv.org/abs/", "")
+                )
                 arxiv_id = clean_arxiv_id(arxiv_id)
-            elif '://hdl.handle.net' in url and not hdl:
+            elif "://hdl.handle.net" in url and not hdl:
                 hdl = clean_hdl(url)
 
         return fatcat_openapi_client.ReleaseExtIds(
@@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter):
         sandcrawler ingest requests.
         """
         EXTID_PATTERNS = [
-            '://doi.acm.org/',
-            '://doi.ieeecomputersociety.org/',
-            'doi.org/10.',
-            'wikidata.org/entity/Q',
-            '://arxiv.org/abs/',
+            "://doi.acm.org/",
+            "://doi.ieeecomputersociety.org/",
+            "doi.org/10.",
+            "wikidata.org/entity/Q",
+            "://arxiv.org/abs/",
         ]
         urls = []
-        for ee in xml_elem.find_all('ee'):
+        for ee in xml_elem.find_all("ee"):
             url = ee.text
             skip = False
             for pattern in EXTID_PATTERNS: