| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-02 11:29:03 -0800 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 23:03:08 -0800 |
| commit | 7d1ac51fb1b67f64f03e4c6e943202085cd4faa9 (patch) | |
| tree | cd5730517b3d9418315efd6784369c2c67a49913 /python/fatcat_tools | |
| parent | 4e332e9037530ebc62836acfa78896dc76700c9c (diff) | |
| download | fatcat-7d1ac51fb1b67f64f03e4c6e943202085cd4faa9.tar.gz, fatcat-7d1ac51fb1b67f64f03e4c6e943202085cd4faa9.zip | |
initial implementation of dblp release importer (in progress)
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 |
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 444 |
2 files changed, 445 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d2928d09..a14e2cec 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -31,3 +31,4 @@ from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWe
 from .shadow import ShadowLibraryImporter
 from .file_meta import FileMetaImporter
 from .doaj_article import DoajArticleImporter
+from .dblp_release import DblpReleaseImporter
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
new file mode 100644
index 00000000..7e19855f
--- /dev/null
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -0,0 +1,444 @@
+
+"""
+Importer for DBLP release-level (article/paper/etc) XML metadata.
+
+Works similarly to the PubMed XML importer: expects to have a large XML file
+iterated over quickly, with individual elements re-parsed into smaller objects
+and passed to `parse_record()`.
+"""
+
+import sys  # noqa: F401
+import warnings
+import datetime
+from typing import List, Optional, Any
+
+import fatcat_openapi_client
+from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
+                                 clean_orcid,
+                                 clean_arxiv_id, clean_wikidata_qid, clean_isbn13)
+from fatcat_tools.importers.common import EntityImporter
+
+
+class DblpReleaseImporter(EntityImporter):
+
+    def __init__(self,
+                 api,
+                 issn_map_file,
+                 **kwargs):
+
+        eg_desc = kwargs.get(
+            'editgroup_description',
+            "Automated import of dblp metadata via XML records"
+        )
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent',
+                                         'fatcat_tools.DblpReleaseImporter')
+        # ensure default is to not do updates with this worker (override super() default)
+        kwargs['do_updates'] = kwargs.get("do_updates", False)
+        super().__init__(api,
+                         issn_map_file=issn_map_file,
+                         editgroup_description=eg_desc,
+                         editgroup_extra=eg_extra,
+                         **kwargs)
+
+        self.this_year = datetime.datetime.now().year
+        self.read_issn_map_file(issn_map_file)
+
+    ELEMENT_TYPES = [
+        "article",
+        "inproceedings",
+        "book",
+        "incollection",
+        "phdthesis",
+        "mastersthesis",
+        "www",
+        #"data",  # no instances in 2020-11 dump
+    ]
+
+    def want(self, xml_elem):
+        if xml_elem.name not in self.ELEMENT_TYPES:
+            self.counts['skip-type'] += 1
+            return False
+        if not xml_elem.get('key'):
+            self.counts['skip-no-key'] += 1
+            return False
+        if xml_elem['key'].startswith('homepage/'):
+            self.counts['skip-type-homepage'] += 1
+            return False
+        return True
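
The module docstring above describes the intended streaming pattern. As a
minimal driver sketch (not part of this commit; `push_record()`/`finish()`
are the generic EntityImporter entry points, and the lxml/bs4 wiring here is
an assumption):

# Sketch only: stream the large dblp.xml with lxml, re-parse each matching
# element into the BeautifulSoup object that parse_record() expects, and
# push it through the importer. dblp.xml references entities from dblp.dtd,
# hence load_dtd=True.
import bs4
from lxml import etree

def stream_dblp_records(xml_path: str, importer: DblpReleaseImporter) -> None:
    for _, elem in etree.iterparse(
            xml_path,
            tag=DblpReleaseImporter.ELEMENT_TYPES,
            load_dtd=True):
        soup = bs4.BeautifulSoup(etree.tostring(elem), "xml")
        importer.push_record(soup.find(elem.tag))
        elem.clear()  # release parsed nodes as we stream
    importer.finish()
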
+    def parse_record(self, xml_elem):
+        """
+        - title
+            => may contain <i>, <sub>, <sup>, <tt>
+        - journal (abbrev?)
+        - volume, pages, number (number -> issue)
+        - publisher
+        - year
+            => for conferences, year of conference not of publication
+        - month
+        - crossref (from inproceedings to specific proceedings volume)
+        - booktitle
+            => for inproceedings, this is the name of the conference or
+               workshop; sometimes an acronym
+        - isbn
+        """
+
+        dblp_key = xml_elem.get('key')
+        if not dblp_key:
+            self.counts['skip-empty-key'] += 1
+            return False
+        dblp_key_type = dblp_key.split('/')[0]
+
+        # dblp_prefix may be used for container lookup
+        dblp_prefix = None
+        if dblp_key_type in ('journals', 'conf'):
+            dblp_prefix = '/'.join(dblp_key.split('/')[:2])
+        elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
+            dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
+
+        publtype = xml_elem.get('publtype') or None
+
+        dblp_type = xml_elem.name
+        if dblp_type not in self.ELEMENT_TYPES:
+            self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+            return False
+
+        if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
+            self.counts['skip-key-type'] += 1
+            return False
+
+        if dblp_key.startswith('journals/corr/'):
+            self.counts['skip-arxiv-corr'] += 1
+            return False
+
+        title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
+        if not title:
+            self.counts['skip-title'] += 1
+            return False
+        if title.endswith('.'):
+            title = title[:-1]
+
+        release_type = None
+        release_stage = 'published'
+        withdrawn_status = None
+
+        # primary release_type detection: type of XML element, then prefix of
+        # key for granularity
+        if dblp_type == 'article':
+            release_type = 'article'
+            if dblp_key_type == 'journals' and publtype != 'informal':
+                release_type = 'article-journal'
+            elif dblp_key_type == 'tr':
+                release_type = 'report'
+            elif title.startswith("Review:"):
+                release_type = 'review'
+        elif dblp_type == 'inproceedings':
+            release_type = 'paper-conference'
+        elif dblp_type == 'book':
+            release_type = 'book'
+        elif dblp_type == 'incollection':
+            # XXX: part vs. chapter?
+            release_type = 'chapter'
+        elif dblp_type == 'data':
+            release_type = 'dataset'
+        elif dblp_type in ('mastersthesis', 'phdthesis'):
+            release_type = 'thesis'
+
+        # overrides/extensions of the above
+        if publtype == 'informal':
+            # for conferences, seems to indicate peer-review status
+            # for journals, seems to indicate things like book reviews; split out above
+            pass
+        elif publtype == 'encyclopedia':
+            release_type = 'entry-encyclopedia'
+        elif publtype == 'edited':
+            # XXX: article?
+            release_type = 'editorial'
+        elif publtype == 'data':
+            release_type = 'dataset'
+        elif publtype == 'software':
+            release_type = 'software'
+        elif publtype == 'withdrawn':
+            withdrawn_status = 'withdrawn'
+        elif publtype == 'survey':
+            # XXX: flag as a review/survey article?
+            pass
+
+        #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
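
The key-prefix derivation above is easy to spot-check in isolation. A
standalone restatement (illustrative only, not part of the commit; the
example keys are made up in dblp's format):

# Restates the dblp_prefix derivation from parse_record() for spot-checking.
from typing import Optional

def dblp_prefix_of(key: str) -> Optional[str]:
    key_type = key.split('/')[0]
    if key_type in ('journals', 'conf'):
        return '/'.join(key.split('/')[:2])
    if key_type in ('series', 'reference', 'tr', 'books'):
        return '/'.join(key.split('/')[:-1])
    return None

assert dblp_prefix_of("journals/cacm/Knuth74") == "journals/cacm"
assert dblp_prefix_of("conf/sigmod/AbadiC05") == "conf/sigmod"
assert dblp_prefix_of("homepages/k/Knuth") is None
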
+        container_name = None
+        booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
+        series = clean_str(xml_elem.series and xml_elem.series.text)
+
+        if xml_elem.journal:
+            container_name = clean_str(xml_elem.journal.text)
+
+        container_id = None
+        if dblp_prefix:
+            # XXX: container lookup by dblp_prefix, from local something
+            pass
+            #container_id = self.lookup_dblp_prefix(dblp_prefix)
+            #if not container_id:
+            #    self.counts['skip-dblp-prefix-lookup'] += 1
+            #    return False
+
+        publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
+        volume = clean_str(xml_elem.volume and xml_elem.volume.text)
+        issue = clean_str(xml_elem.number and xml_elem.number.text)
+        pages = clean_str(xml_elem.pages and xml_elem.pages.text)
+        release_year = clean_str(xml_elem.year and xml_elem.year.text)
+        if release_year and release_year.isdigit():
+            release_year = int(release_year)
+        else:
+            release_year = None
+        release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
+        isbn = clean_isbn13(xml_elem.isbn and xml_elem.isbn.text)
+        part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
+
+        # block bogus far-future years/dates
+        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+            release_month = None
+            release_year = None
+
+        contribs = self.dblp_contribs(xml_elem)
+        ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
+        if isbn:
+            ext_ids.isbn13 = isbn
+        if ext_ids.doi:
+            self.counts['has-doi'] += 1
+
+        # dblp-specific extra
+        dblp_extra = dict(type=dblp_type)
+        note = clean_str(xml_elem.note and xml_elem.note.text)
+        if note and 'base-search.net' not in note:
+            dblp_extra['note'] = note
+        if part_of_key:
+            dblp_extra['part_of_key'] = part_of_key
+
+        # generic extra
+        extra = dict()
+        if not container_id and container_name:
+            extra['container_name'] = container_name
+
+        if series and (dblp_key_type == 'series' or dblp_type == 'book'):
+            extra['series-title'] = series
+        elif series:
+            dblp_extra['series'] = series
+
+        if booktitle and dblp_key_type == 'series':
+            extra['container-title'] = booktitle
+        elif booktitle and dblp_key_type == 'conf':
+            extra['event'] = booktitle
+        elif booktitle:
+            dblp_extra['booktitle'] = booktitle
+
+        if release_year and release_month:
+            # TODO: schema migration
+            extra['release_month'] = release_month
+
+        if dblp_extra:
+            extra['dblp'] = dblp_extra
+        if not extra:
+            extra = None
+
+        re = fatcat_openapi_client.ReleaseEntity(
+            work_id=None,
+            container_id=container_id,
+            release_type=release_type,
+            release_stage=release_stage,
+            withdrawn_status=withdrawn_status,
+            title=title,
+            release_year=release_year,
+            #release_date,
+            publisher=publisher,
+            ext_ids=ext_ids,
+            contribs=contribs,
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            extra=extra,
+        )
+        re = self.biblio_hacks(re)
+        return re
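
The container lookup left as an XXX above needs some local mapping from dblp
prefix to fatcat container ident. One possible shape, purely an assumption
(file name, format, and the helper itself are hypothetical):

# Hypothetical: load a local TSV mapping dblp prefixes ("journals/cacm") to
# fatcat container idents once at startup; lookup_dblp_prefix() could then
# be a plain dict.get() with a skip counter on miss.
def load_dblp_container_map(path: str) -> dict:
    mapping = {}
    with open(path) as f:
        for line in f:
            row = line.rstrip('\n').split('\t')
            if len(row) >= 2 and row[0]:
                mapping[row[0]] = row[1]
    return mapping
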
+ """ + return re + + def try_update(self, re): + + # lookup existing release by dblp article id + existing = None + try: + existing = self.api.lookup_release(dblp=re.ext_ids.dblp) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + # then try other ext_id lookups + if not existing: + for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv_id'): + extid_val = getattr(re.ext_ids, extid_type) + if not extid_val: + continue + #print(f" lookup release type: {extid_type} val: {extid_val}") + try: + existing = self.api.lookup_release(**{extid_type: extid_val}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + if existing: + if existing.ext_ids.dblp: + warn_str = f"unexpected dblp ext_id match after lookup failed dblp={re.ext_ids.dblp} ident={existing.ident}" + warnings.warn(warn_str) + self.counts["skip-dblp-id-mismatch"] += 1 + return False + break + + # TODO: in the future could do fuzzy match here, eg using elasticsearch + + # create entity + if not existing: + return True + + # other logic could go here about skipping updates + if not self.do_updates or existing.ext_ids.dblp: + self.counts['exists'] += 1 + return False + + # fields to copy over for update + # TODO: granular contrib metadata + existing.contribs = existing.contribs or re.contribs + existing.ext_ids.dblp = existing.ext_ids.dblp or re.ext_ids.dblp + existing.ext_ids.wikidata_qid = existing.ext_ids.wikidata_qid or re.ext_ids.wikidata_qid + existing.release_type = existing.release_type or re.release_type + existing.release_stage = existing.release_stage or re.release_stage + existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status + existing.container_id = existing.container_id or re.container_id + existing.extra['dblp'] = re.extra['dblp'] + existing.volume = existing.volume or re.volume + existing.issue = existing.issue or re.issue + existing.pages = existing.pages or re.pages + + try: + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) + self.counts['update'] += 1 + except fatcat_openapi_client.rest.ApiException as err: + # there is a code path where we try to update the same release + # twice in a row; if that happens, just skip + # NOTE: API behavior might change in the future? + if "release_edit_editgroup_id_ident_id_key" in err.body: + self.counts['skip-update-conflict'] += 1 + return False + else: + raise err + + return False + + def insert_batch(self, batch): + self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: + """ + - author (multiple; each a single string) + => may have HTML entities + => may have a number at the end, to aid with identifier creation + => orcid + - editor (same as author) + => orcid + """ + contribs = [] + index = 0 + for elem in authors.find_all('author'): + contrib = self.dblp_contrib_single(elem) + contrib.role = "author" + contrib.index = index + contribs.append(contrib) + index += 1 + + for elem in authors.find_all('editor'): + contrib = self.dblp_contrib_single(elem) + contrib.role = "editor" + contribs.append(contrib) + + return contribs + + def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib: + """ + In the future, might try to implement creator key-ificiation and lookup here. 
+    def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
+        """
+        In the future, might try to implement creator key-ification and
+        lookup here.
+
+        Example rows:
+
+            <author>Michael H. Böhlen</author>
+            <author orcid="0000-0002-4354-9138">Nicolas Heist</author>
+            <author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>
+        """
+
+        creator_id = None
+        extra = None
+        raw_name = clean_str(elem.text)
+
+        # remove number in author name, if present
+        if raw_name and raw_name.split()[-1].isdigit():
+            raw_name = ' '.join(raw_name.split()[:-1])
+
+        if elem.get('orcid'):
+            orcid = clean_orcid(elem['orcid'])
+            if orcid:
+                creator_id = self.lookup_orcid(orcid)
+                if not creator_id:
+                    extra = dict(orcid=orcid)
+        return fatcat_openapi_client.ReleaseContrib(
+            raw_name=raw_name,
+            creator_id=creator_id,
+            extra=extra,
+        )
+
+    def dblp_ext_ids(self, xml_elem: Any, dblp_key: str) -> fatcat_openapi_client.ReleaseExtIds:
+        """
+        Takes a full XML object and returns external identifiers.
+
+        Currently these can be arxiv identifiers, DOI, or wikidata QID.
+
+        - ee (electronic edition; often DOI?)
+            => in some cases a "local" URL
+            => publisher URL; often DOI
+            => type attr
+        - url
+            => dblp internal link to table-of-contents
+        """
+
+        doi: Optional[str] = None
+        wikidata_qid: Optional[str] = None
+        arxiv_id: Optional[str] = None
+        for ee in xml_elem.find_all('ee'):
+            url = ee.text
+            # convert DOI-like domains, which mostly have DOIs anyways
+            if '://doi.acm.org/' in url:
+                url = url.replace('://doi.acm.org/', '://doi.org/')
+            elif '://doi.ieeecomputersociety.org/' in url:
+                url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+
+            if 'doi.org/10.' in url and not doi:
+                doi = clean_doi(url)
+            elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+                wikidata_qid = clean_wikidata_qid(url)
+            elif '://arxiv.org/abs/' in url and not arxiv_id:
+                arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+                arxiv_id = clean_arxiv_id(arxiv_id)
+
+        return fatcat_openapi_client.ReleaseExtIds(
+            dblp=dblp_key,
+            doi=doi,
+            wikidata_qid=wikidata_qid,
+            arxiv=arxiv_id,
+        )
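
And a matching illustration for dblp_ext_ids() (the <ee> URLs and identifier
values are made up; assumes the same `importer` as above):

import bs4

frag = bs4.BeautifulSoup(
    '<article key="journals/example/Demo20">'
    '<ee>https://doi.acm.org/10.1145/0000000.0000000</ee>'
    '<ee>https://arxiv.org/abs/2001.00001</ee>'
    '</article>', "xml")
ids = importer.dblp_ext_ids(frag.article, "journals/example/Demo20")
# doi.acm.org is normalized to doi.org before DOI extraction, so:
#   ids.doi == "10.1145/0000000.0000000"
#   ids.arxiv == "2001.00001"
#   ids.dblp == "journals/example/Demo20"
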