author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-02 11:29:03 -0800
---|---|---
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 23:03:08 -0800
commit | 7d1ac51fb1b67f64f03e4c6e943202085cd4faa9 (patch) |
tree | cd5730517b3d9418315efd6784369c2c67a49913 |
parent | 4e332e9037530ebc62836acfa78896dc76700c9c (diff) |
download | fatcat-7d1ac51fb1b67f64f03e4c6e943202085cd4faa9.tar.gz fatcat-7d1ac51fb1b67f64f03e4c6e943202085cd4faa9.zip |
initial implementation of dblp release importer (in progress)
-rwxr-xr-x | python/fatcat_import.py | 29
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 444
3 files changed, 474 insertions, 0 deletions
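
For orientation before the diff: the new `dblp-release` subcommand wires `DblpReleaseImporter` into the existing `Bs4XmlLargeFilePusher` machinery. Below is a minimal sketch of driving the importer directly from Python, mirroring `run_dblp_release()` from this commit; the `authenticated_api()` helper usage, host URL, file names, and batch size are illustrative assumptions, not part of the commit.

```python
# Sketch only: mirrors run_dblp_release() from this commit.
# The API host, token handling, and file paths are assumptions for illustration.
from fatcat_tools import authenticated_api
from fatcat_tools.importers import DblpReleaseImporter, Bs4XmlLargeFilePusher

api = authenticated_api("https://api.fatcat.wiki/v0")  # assumed endpoint; token usually via env var
importer = DblpReleaseImporter(
    api,
    issn_map_file=open("ISSN-to-ISSN-L.txt", "r"),  # hypothetical ISSN-L mapping file
    edit_batch_size=50,
    do_updates=False,
)
Bs4XmlLargeFilePusher(
    importer,
    open("dblp.xml", "rb"),                 # hypothetical dblp XML dump
    DblpReleaseImporter.ELEMENT_TYPES,      # element tags the importer accepts
    use_lxml=True,
).run()
```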
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 6c9e65a8..5ee81b92 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -273,6 +273,19 @@ def run_doaj_article(args):
     else:
         JsonLinePusher(dai, args.json_file).run()
 
+def run_dblp_release(args):
+    dwi = DblpReleaseImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size,
+        do_updates=args.do_updates,
+    )
+    Bs4XmlLargeFilePusher(
+        dwi,
+        args.xml_file,
+        DblpReleaseImporter.ELEMENT_TYPES,
+        use_lxml=True,
+    ).run()
+
 def run_file_meta(args):
     # do_updates defaults to true for this importer
     fmi = FileMetaImporter(args.api,
@@ -642,6 +655,22 @@ def main():
         auth_var="FATCAT_AUTH_WORKER_DOAJ",
     )
 
+    sub_dblp_release = subparsers.add_parser('dblp-release',
+        help="import dblp release metadata")
+    sub_dblp_release.add_argument('xml_file',
+        help="File with DBLP XML to import from",
+        default=sys.stdin, type=argparse.FileType('rb'))
+    sub_dblp_release.add_argument('--issn-map-file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_dblp_release.add_argument('--do-updates',
+        action='store_true',
+        help="update any pre-existing release entities")
+    sub_dblp_release.set_defaults(
+        func=run_dblp_release,
+        auth_var="FATCAT_AUTH_WORKER_DBLP",
+    )
+
     sub_file_meta = subparsers.add_parser('file-meta',
         help="simple update-only importer for file metadata")
     sub_file_meta.set_defaults(
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d2928d09..a14e2cec 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -31,3 +31,4 @@ from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWe
 from .shadow import ShadowLibraryImporter
 from .file_meta import FileMetaImporter
 from .doaj_article import DoajArticleImporter
+from .dblp_release import DblpReleaseImporter
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
new file mode 100644
index 00000000..7e19855f
--- /dev/null
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -0,0 +1,444 @@
+
+"""
+Importer for DBLP release-level (article/paper/etc) XML metadata.
+
+Works similarly to PubMed XML importer: expects to have a large XML file
+iterated over quickly, with individual elements re-parsed into smaller objects
+and passed to `parse_record()`.
+"""
+
+import sys  # noqa: F401
+import warnings
+import datetime
+from typing import List, Optional, Any
+
+import fatcat_openapi_client
+from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
+    clean_orcid,
+    clean_arxiv_id, clean_wikidata_qid, clean_isbn13)
+from fatcat_tools.importers.common import EntityImporter
+
+
+class DblpReleaseImporter(EntityImporter):
+
+    def __init__(self,
+                 api,
+                 issn_map_file,
+                 **kwargs):
+
+        eg_desc = kwargs.get(
+            'editgroup_description',
+            "Automated import of dblp metadata via XML records"
+        )
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent',
+            'fatcat_tools.DblpReleaseImporter')
+        # ensure default is to not do updates with this worker (override super() default)
+        kwargs['do_updates'] = kwargs.get("do_updates", False)
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self.this_year = datetime.datetime.now().year
+        self.read_issn_map_file(issn_map_file)
+
+    ELEMENT_TYPES = [
+        "article",
+        "inproceedings",
+        "book",
+        "incollection",
+        "phdthesis",
+        "mastersthesis",
+        "www",
+        #"data",  # no instances in 2020-11 dump
+    ]
+
+    def want(self, xml_elem):
+        if xml_elem.name not in self.ELEMENT_TYPES:
+            self.counts['skip-type'] += 1
+            return False
+        if not xml_elem.get('key'):
+            self.counts['skip-no-key'] += 1
+            return False
+        if xml_elem['key'].startswith('homepage/'):
+            self.counts['skip-type-homepage'] += 1
+            return False
+        return True
+
+    def parse_record(self, xml_elem):
+        """
+        - title
+            => may contain <i>, <sub>, <sup>, <tt>
+        - journal (abbrev?)
+        - volume, pages, number (number -> issue)
+        - publisher
+        - year
+            => for conferences, year of conference not of publication
+        - month
+        - crossref (from inproceedings to specific proceedings volume)
+        - booktitle
+            => for inproceedings, this is the name of conference or workshop. acronym.
+        - isbn
+        """
+
+        dblp_key = xml_elem.get('key')
+        if not dblp_key:
+            self.counts['skip-empty-key'] += 1
+            return False
+        dblp_key_type = dblp_key.split('/')[0]
+
+        # dblp_prefix may be used for container lookup
+        dblp_prefix = None
+        if dblp_key_type in ('journals', 'conf'):
+            dblp_prefix = '/'.join(dblp_key.split('/')[:2])
+        elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
+            dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
+
+        publtype = xml_elem.get('publtype') or None
+
+        dblp_type = xml_elem.name
+        if dblp_type not in self.ELEMENT_TYPES:
+            self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+
+        if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
+            self.counts['skip-key-type'] += 1
+            return False
+
+        if dblp_key.startswith('journals/corr/'):
+            self.counts['skip-arxiv-corr'] += 1
+            return False
+
+        title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
+        if not title:
+            self.counts['skip-title'] += 1
+            return False
+        if title.endswith('.'):
+            title = title[:-1]
+
+        release_type = None
+        release_stage = 'published'
+        withdrawn_status = None
+
+        # primary release_type detection: type of XML element, then prefix of key for granularity
+        if dblp_type == 'article':
+            release_type = 'article'
+            if dblp_key_type == 'journals' and publtype != 'informal':
+                release_type = 'article-journal'
+            elif dblp_key_type == 'tr':
+                release_type = 'report'
+            elif title.startswith("Review:"):
+                release_type = 'review'
+        elif dblp_type == 'inproceedings':
+            release_type = 'paper-conference'
+        elif dblp_type == 'book':
+            release_type = 'book'
+        elif dblp_type == 'incollection':
+            # XXX: part vs. chapter?
+            release_type = 'chapter'
+        elif dblp_type == 'data':
+            release_type = 'dataset'
+        elif dblp_type in ('mastersthesis', 'phdthesis'):
+            release_type = 'thesis'
+
+        # overrides/extensions of the above
+        if publtype == 'informal':
+            # for conferences, seems to indicate peer-review status
+            # for journals, seems to indicate things like book reviews; split out above
+            pass
+        elif publtype == 'encyclopedia':
+            release_type = 'entry-encyclopedia'
+        elif publtype == 'edited':
+            # XXX: article?
+            release_type = 'editorial'
+        elif publtype == 'data':
+            release_type = 'dataset'
+        elif publtype == 'software':
+            release_type = 'software'
+        elif publtype == 'withdrawn':
+            withdrawn_status = 'withdrawn'
+        elif publtype == 'survey':
+            # XXX: flag as a review/survey article?
+            pass
+
+        #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
+
+        container_name = None
+        booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
+        series = clean_str(xml_elem.series and xml_elem.series.text)
+
+        if xml_elem.journal:
+            container_name = clean_str(xml_elem.journal.text)
+
+        container_id = None
+        if dblp_prefix:
+            # XXX: container lookup by dblp_prefix, from local something
+            pass
+            #container_id = self.lookup_dblp_prefix(dblp_prefix)
+            #if not container_id:
+            #    self.counts['skip-dblp-prefix-lookup'] += 1
+            #    return False
+
+        publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
+        volume = clean_str(xml_elem.volume and xml_elem.volume.text)
+        issue = clean_str(xml_elem.number and xml_elem.number.text)
+        pages = clean_str(xml_elem.pages and xml_elem.pages.text)
+        release_year = clean_str(xml_elem.year and xml_elem.year.text)
+        if release_year and release_year.isdigit():
+            release_year = int(release_year)
+        else:
+            release_year = None
+        release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
+        isbn = clean_isbn13(xml_elem.isbn and xml_elem.isbn.text)
+        part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
+
+        # block bogus far-future years/dates
+        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+            release_month = None
+            release_year = None
+
+        contribs = self.dblp_contribs(xml_elem or [])
+        ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
+        if isbn:
+            ext_ids.isbn13 = isbn
+        if ext_ids.doi:
+            self.counts['has-doi'] += 1
+
+        # dblp-specific extra
+        dblp_extra = dict(type=dblp_type)
+        note = clean_str(xml_elem.note and xml_elem.note.text)
+        if note and 'base-search.net' not in note:
+            dblp_extra['note'] = note
+        if part_of_key:
+            dblp_extra['part_of_key'] = part_of_key
+
+        # generic extra
+        extra = dict()
+        if not container_id and container_name:
+            extra['container_name'] = container_name
+
+        if series and (dblp_key_type == 'series' or dblp_type == 'book'):
+            extra['series-title'] = series
+        elif series:
+            dblp_extra['series'] = series
+
+        if booktitle and dblp_key_type == 'series':
+            extra['container-title'] = booktitle
+        elif booktitle and dblp_key_type == 'conf':
+            extra['event'] = booktitle
+        elif booktitle:
+            dblp_extra['booktitle'] = booktitle
+
+        if release_year and release_month:
+            # TODO: schema migration
+            extra['release_month'] = release_month
+
+        if dblp_extra:
+            extra['dblp'] = dblp_extra
+        if not extra:
+            extra = None
+
+        re = fatcat_openapi_client.ReleaseEntity(
+            work_id=None,
+            container_id=container_id,
+            release_type=release_type,
+            release_stage=release_stage,
+            withdrawn_status=withdrawn_status,
+            title=title,
+            release_year=release_year,
+            #release_date,
+            publisher=publisher,
+            ext_ids=ext_ids,
+            contribs=contribs,
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            extra=extra,
+        )
+        re = self.biblio_hacks(re)
+        return re
+
+    @staticmethod
+    def biblio_hacks(re):
+        """
+        This function handles known special cases. For example,
+        publisher-specific or platform-specific workarounds.
+        """
+        return re
+
+    def try_update(self, re):
+
+        # lookup existing release by dblp article id
+        existing = None
+        try:
+            existing = self.api.lookup_release(dblp=re.ext_ids.dblp)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        # then try other ext_id lookups
+        if not existing:
+            for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
+                extid_val = getattr(re.ext_ids, extid_type)
+                if not extid_val:
+                    continue
+                #print(f"  lookup release type: {extid_type} val: {extid_val}")
+                try:
+                    existing = self.api.lookup_release(**{extid_type: extid_val})
+                except fatcat_openapi_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+                if existing:
+                    if existing.ext_ids.dblp:
+                        warn_str = f"unexpected dblp ext_id match after lookup failed dblp={re.ext_ids.dblp} ident={existing.ident}"
+                        warnings.warn(warn_str)
+                        self.counts["skip-dblp-id-mismatch"] += 1
+                        return False
+                    break
+
+        # TODO: in the future could do fuzzy match here, eg using elasticsearch
+
+        # create entity
+        if not existing:
+            return True
+
+        # other logic could go here about skipping updates
+        if not self.do_updates or existing.ext_ids.dblp:
+            self.counts['exists'] += 1
+            return False
+
+        # fields to copy over for update
+        # TODO: granular contrib metadata
+        existing.contribs = existing.contribs or re.contribs
+        existing.ext_ids.dblp = existing.ext_ids.dblp or re.ext_ids.dblp
+        existing.ext_ids.wikidata_qid = existing.ext_ids.wikidata_qid or re.ext_ids.wikidata_qid
+        existing.release_type = existing.release_type or re.release_type
+        existing.release_stage = existing.release_stage or re.release_stage
+        existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
+        existing.container_id = existing.container_id or re.container_id
+        existing.extra['dblp'] = re.extra['dblp']
+        existing.volume = existing.volume or re.volume
+        existing.issue = existing.issue or re.issue
+        existing.pages = existing.pages or re.pages
+
+        try:
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+        except fatcat_openapi_client.rest.ApiException as err:
+            # there is a code path where we try to update the same release
+            # twice in a row; if that happens, just skip
+            # NOTE: API behavior might change in the future?
+            if "release_edit_editgroup_id_ident_id_key" in err.body:
+                self.counts['skip-update-conflict'] += 1
+                return False
+            else:
+                raise err
+
+        return False
+
+    def insert_batch(self, batch):
+        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
+    def dblp_contribs(self, authors: Any) -> List[fatcat_openapi_client.ReleaseContrib]:
+        """
+        - author (multiple; each a single string)
+            => may have HTML entities
+            => may have a number at the end, to aid with identifier creation
+            => orcid
+        - editor (same as author)
+            => orcid
+        """
+        contribs = []
+        index = 0
+        for elem in authors.find_all('author'):
+            contrib = self.dblp_contrib_single(elem)
+            contrib.role = "author"
+            contrib.index = index
+            contribs.append(contrib)
+            index += 1
+
+        for elem in authors.find_all('editor'):
+            contrib = self.dblp_contrib_single(elem)
+            contrib.role = "editor"
+            contribs.append(contrib)
+
+        return contribs
+
+    def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
+        """
+        In the future, might try to implement creator key-ification and lookup here.
+
+        Example rows:
+
+        <author>Michael H. Böhlen</author>
+        <author orcid="0000-0002-4354-9138">Nicolas Heist</author>
+        <author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>
+        """
+
+        creator_id = None
+        extra = None
+        raw_name = clean_str(elem.text)
+
+        # remove number in author name, if present
+        if raw_name.split()[-1].isdigit():
+            raw_name = ' '.join(raw_name.split()[:-1])
+
+        if elem.get('orcid'):
+            orcid = clean_orcid(elem['orcid'])
+            if orcid:
+                creator_id = self.lookup_orcid(orcid)
+                if not creator_id:
+                    extra = dict(orcid=orcid)
+        return fatcat_openapi_client.ReleaseContrib(
+            raw_name=raw_name,
+            creator_id=creator_id,
+            extra=extra,
+        )
+
+    def dblp_ext_ids(self, xml_elem: Any, dblp_key: str) -> fatcat_openapi_client.ReleaseExtIds:
+        """
+        Takes a full XML object and returns external identifiers.
+
+        Currently these can be arxiv identifiers, DOI, or wikidata QID
+
+        - ee (electronic edition; often DOI?)
+            => in some cases a "local" URL
+            => publisher URL; often DOI
+            => type attr
+        - url
+            => dblp internal link to table-of-contents
+        """
+
+        doi: Optional[str] = None
+        wikidata_qid: Optional[str] = None
+        arxiv_id: Optional[str] = None
+        for ee in xml_elem.find_all('ee'):
+            url = ee.text
+            # convert DOI-like domains, which mostly have DOIs anyways
+            if '://doi.acm.org/' in url:
+                url = url.replace('://doi.acm.org/', '://doi.org/')
+            elif '://doi.ieeecomputersociety.org/' in url:
+                url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+
+            if 'doi.org/10.' in url and not doi:
+                doi = clean_doi(url)
+            elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+                wikidata_qid = clean_wikidata_qid(url)
+            elif '://arxiv.org/abs/' in url and not arxiv_id:
+                arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+                arxiv_id = clean_arxiv_id(arxiv_id)
+
+        return fatcat_openapi_client.ReleaseExtIds(
+            dblp=dblp_key,
+            doi=doi,
+            wikidata_qid=wikidata_qid,
+            arxiv=arxiv_id,
+        )
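
The `<ee>` URL handling in `dblp_ext_ids()` is the part of the importer most likely to need spot-checking against real records. Below is a standalone restatement of just that DOI-proxy normalization rule, useful for checking records outside the importer; the function name and the sample URL are illustrative and not part of the commit.

```python
# Illustrative re-statement of the URL rewriting done in dblp_ext_ids();
# not the importer's actual entry point.
from typing import Optional

def normalize_ee_url(url: str) -> Optional[str]:
    """Rewrite DOI-proxy hosts that dblp links to onto doi.org, as the importer does."""
    if '://doi.acm.org/' in url:
        return url.replace('://doi.acm.org/', '://doi.org/')
    if '://doi.ieeecomputersociety.org/' in url:
        return url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
    return url

# hypothetical example URL, for demonstration only
assert normalize_ee_url("https://doi.acm.org/10.1145/1234567") == "https://doi.org/10.1145/1234567"
```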