author    Bryan Newbold <bnewbold@robocracy.org>  2020-12-02 11:29:03 -0800
committer Bryan Newbold <bnewbold@robocracy.org>  2020-12-17 23:03:08 -0800
commit    7d1ac51fb1b67f64f03e4c6e943202085cd4faa9 (patch)
tree      cd5730517b3d9418315efd6784369c2c67a49913
parent    4e332e9037530ebc62836acfa78896dc76700c9c (diff)
initial implementation of dblp release importer (in progress)
-rwxr-xr-x  python/fatcat_import.py                         29
-rw-r--r--  python/fatcat_tools/importers/__init__.py        1
-rw-r--r--  python/fatcat_tools/importers/dblp_release.py  444
3 files changed, 474 insertions, 0 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 6c9e65a8..5ee81b92 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -273,6 +273,19 @@ def run_doaj_article(args):
else:
JsonLinePusher(dai, args.json_file).run()
+def run_dblp_release(args):
+ dwi = DblpReleaseImporter(args.api,
+ args.issn_map_file,
+ edit_batch_size=args.batch_size,
+ do_updates=args.do_updates,
+ )
+ Bs4XmlLargeFilePusher(
+ dwi,
+ args.xml_file,
+ DblpReleaseImporter.ELEMENT_TYPES,
+ use_lxml=True,
+ ).run()
+
def run_file_meta(args):
# do_updates defaults to true for this importer
fmi = FileMetaImporter(args.api,
@@ -642,6 +655,22 @@ def main():
auth_var="FATCAT_AUTH_WORKER_DOAJ",
)
+ sub_dblp_release = subparsers.add_parser('dblp-release',
+ help="import dblp release metadata")
+ sub_dblp_release.add_argument('xml_file',
+ help="File with DBLP XML to import from",
+ default=sys.stdin, type=argparse.FileType('rb'))
+ sub_dblp_release.add_argument('--issn-map-file',
+ help="ISSN to ISSN-L mapping file",
+ default=None, type=argparse.FileType('r'))
+ sub_dblp_release.add_argument('--do-updates',
+ action='store_true',
+ help="update any pre-existing release entities")
+ sub_dblp_release.set_defaults(
+ func=run_dblp_release,
+ auth_var="FATCAT_AUTH_WORKER_DBLP",
+ )
+
sub_file_meta = subparsers.add_parser('file-meta',
help="simple update-only importer for file metadata")
sub_file_meta.set_defaults(
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d2928d09..a14e2cec 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -31,3 +31,4 @@ from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWe
from .shadow import ShadowLibraryImporter
from .file_meta import FileMetaImporter
from .doaj_article import DoajArticleImporter
+from .dblp_release import DblpReleaseImporter
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
new file mode 100644
index 00000000..7e19855f
--- /dev/null
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -0,0 +1,444 @@
+
+"""
+Importer for DBLP release-level (article/paper/etc) XML metadata.
+
+Works similarly to the PubMed XML importer: expects a large XML file to be
+iterated over quickly, with individual elements re-parsed into smaller objects
+and passed to `parse_record()`.
+"""
+
+import sys # noqa: F401
+import warnings
+import datetime
+from typing import List, Optional, Any
+
+import fatcat_openapi_client
+from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
+ clean_orcid,
+ clean_arxiv_id, clean_wikidata_qid, clean_isbn13)
+from fatcat_tools.importers.common import EntityImporter
+
+
+class DblpReleaseImporter(EntityImporter):
+
+ def __init__(self,
+ api,
+ issn_map_file,
+ **kwargs):
+
+ eg_desc = kwargs.get(
+ 'editgroup_description',
+ "Automated import of dblp metadata via XML records"
+ )
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent',
+ 'fatcat_tools.DblpReleaseImporter')
+ # ensure default is to not do updates with this worker (override super() default)
+ kwargs['do_updates'] = kwargs.get("do_updates", False)
+ super().__init__(api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ self.this_year = datetime.datetime.now().year
+ self.read_issn_map_file(issn_map_file)
+
+ ELEMENT_TYPES = [
+ "article",
+ "inproceedings",
+ "book",
+ "incollection",
+ "phdthesis",
+ "mastersthesis",
+ "www",
+ #"data", # no instances in 2020-11 dump
+ ]
+
+ def want(self, xml_elem):
+ if xml_elem.name not in self.ELEMENT_TYPES:
+ self.counts['skip-type'] += 1
+ return False
+ if not xml_elem.get('key'):
+ self.counts['skip-no-key'] += 1
+ return False
+ if xml_elem['key'].startswith('homepages/'):
+ self.counts['skip-type-homepage'] += 1
+ return False
+ return True
+
+ def parse_record(self, xml_elem):
+ """
+ - title
+ => may contain <i>, <sub>, <sup>, <tt>
+ - journal (abbrev?)
+ - volume, pages, number (number -> issue)
+ - publisher
+ - year
+ => for conferences, year of conference not of publication
+ - month
+ - crossref (from inproceedings to specific proceedings volume)
+ - booktitle
+ => for inproceedings, this is the name of the conference or workshop, often as an acronym
+ - isbn
+ """
+
+ dblp_key = xml_elem.get('key')
+ if not dblp_key:
+ self.counts['skip-empty-key'] += 1
+ return False
+ dblp_key_type = dblp_key.split('/')[0]
+
+ # dblp_prefix may be used for container lookup
+ dblp_prefix = None
+ if dblp_key_type in ('journals', 'conf'):
+ dblp_prefix = '/'.join(dblp_key.split('/')[:2])
+ elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
+ dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
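+ # For example (keys invented for illustration):
+ #   'journals/sigmod/Doe20'      -> 'journals/sigmod'
+ #   'conf/examplesym/2020/Doe20' -> 'conf/examplesym'
+ #   'books/examplepub/Doe20'     -> 'books/examplepub'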
+
+ publtype = xml_elem.get('publtype') or None
+
+ dblp_type = xml_elem.name
+ if dblp_type not in self.ELEMENT_TYPES:
+ self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+ return False
+
+ if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
+ self.counts['skip-key-type'] += 1
+ return False
+
+ if dblp_key.startswith('journals/corr/'):
+ self.counts['skip-arxiv-corr'] += 1
+ return False
+
+ title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
+ if not title:
+ self.counts['skip-title'] += 1
+ return False
+ if title.endswith('.'):
+ title = title[:-1]
+
+ release_type = None
+ release_stage = 'published'
+ withdrawn_status = None
+
+ # primary release_type detection: type of XML element, then prefix of key for granularity
+ if dblp_type == 'article':
+ release_type = 'article'
+ if dblp_key_type == 'journals' and publtype != 'informal':
+ release_type = 'article-journal'
+ elif dblp_key_type == 'tr':
+ release_type = 'report'
+ elif title.startswith("Review:"):
+ release_type = 'review'
+ elif dblp_type == 'inproceedings':
+ release_type = 'paper-conference'
+ elif dblp_type == 'book':
+ release_type = 'book'
+ elif dblp_type == 'incollection':
+ # XXX: part vs. chapter?
+ release_type = 'chapter'
+ elif dblp_type == 'data':
+ release_type = 'dataset'
+ elif dblp_type in ('mastersthesis', 'phdthesis'):
+ release_type = 'thesis'
+
+ # overrides/extensions of the above
+ if publtype == 'informal':
+ # for conferences, seems to indicate peer-review status
+ # for journals, seems to indicate things like book reviews; split out above
+ pass
+ elif publtype == 'encyclopedia':
+ release_type = 'entry-encyclopedia'
+ elif publtype == 'edited':
+ # XXX: article?
+ release_type = 'editorial'
+ elif publtype == 'data':
+ release_type = 'dataset'
+ elif publtype == 'software':
+ release_type = 'software'
+ elif publtype == 'withdrawn':
+ withdrawn_status = 'withdrawn'
+ elif publtype == 'survey':
+ # XXX: flag as a review/survey article?
+ pass
+
+ #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
+
+ container_name = None
+ booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
+ series = clean_str(xml_elem.series and xml_elem.series.text)
+
+ if xml_elem.journal:
+ container_name = clean_str(xml_elem.journal.text)
+
+ container_id = None
+ if dblp_prefix:
+ # XXX: container lookup by dblp_prefix, from local something
+ pass
+ #container_id = self.lookup_dblp_prefix(dblp_prefix)
+ #if not container_id:
+ # self.counts['skip-dblp-prefix-lookup'] += 1
+ # return False
+
+ publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
+ volume = clean_str(xml_elem.volume and xml_elem.volume.text)
+ issue = clean_str(xml_elem.number and xml_elem.number.text)
+ pages = clean_str(xml_elem.pages and xml_elem.pages.text)
+ release_year = clean_str(xml_elem.year and xml_elem.year.text)
+ if release_year and release_year.isdigit():
+ release_year = int(release_year)
+ else:
+ release_year = None
+ release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
+ isbn = clean_isbn13(xml_elem.isbn and xml_elem.isbn.text)
+ part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
+
+ # block bogus far-future years/dates
+ if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ release_month = None
+ release_year = None
+
+ contribs = self.dblp_contribs(xml_elem or [])
+ ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
+ if isbn:
+ ext_ids.isbn13 = isbn
+ if ext_ids.doi:
+ self.counts['has-doi'] += 1
+
+ # dblp-specific extra
+ dblp_extra = dict(type=dblp_type)
+ note = clean_str(xml_elem.note and xml_elem.note.text)
+ if note and 'base-search.net' not in note:
+ dblp_extra['note'] = note
+ if part_of_key:
+ dblp_extra['part_of_key'] = part_of_key
+
+ # generic extra
+ extra = dict()
+ if not container_id and container_name:
+ extra['container_name'] = container_name
+
+ if series and (dblp_key_type == 'series' or dblp_type == 'book'):
+ extra['series-title'] = series
+ elif series:
+ dblp_extra['series'] = series
+
+ if booktitle and dblp_key_type == 'series':
+ extra['container-title'] = booktitle
+ elif booktitle and dblp_key_type == 'conf':
+ extra['event'] = booktitle
+ elif booktitle:
+ dblp_extra['booktitle'] = booktitle
+
+ if release_year and release_month:
+ # TODO: schema migration
+ extra['release_month'] = release_month
+
+ if dblp_extra:
+ extra['dblp'] = dblp_extra
+ if not extra:
+ extra = None
+
+ re = fatcat_openapi_client.ReleaseEntity(
+ work_id=None,
+ container_id=container_id,
+ release_type=release_type,
+ release_stage=release_stage,
+ withdrawn_status=withdrawn_status,
+ title=title,
+ release_year=release_year,
+ #release_date,
+ publisher=publisher,
+ ext_ids=ext_ids,
+ contribs=contribs,
+ volume=volume,
+ issue=issue,
+ pages=pages,
+ extra=extra,
+ )
+ re = self.biblio_hacks(re)
+ return re
+
+ @staticmethod
+ def biblio_hacks(re):
+ """
+ This function handles known special cases. For example,
+ publisher-specific or platform-specific workarounds.
+ """
+ return re
+
+ def try_update(self, re):
+
+ # lookup existing release by dblp article id
+ existing = None
+ try:
+ existing = self.api.lookup_release(dblp=re.ext_ids.dblp)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # then try other ext_id lookups
+ if not existing:
+ for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv_id'):
+ extid_val = getattr(re.ext_ids, extid_type)
+ if not extid_val:
+ continue
+ #print(f" lookup release type: {extid_type} val: {extid_val}")
+ try:
+ existing = self.api.lookup_release(**{extid_type: extid_val})
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ if existing:
+ if existing.ext_ids.dblp:
+ warn_str = f"unexpected dblp ext_id match after lookup failed dblp={re.ext_ids.dblp} ident={existing.ident}"
+ warnings.warn(warn_str)
+ self.counts["skip-dblp-id-mismatch"] += 1
+ return False
+ break
+
+ # TODO: in the future could do fuzzy match here, eg using elasticsearch
+
+ # create entity
+ if not existing:
+ return True
+
+ # other logic could go here about skipping updates
+ if not self.do_updates or existing.ext_ids.dblp:
+ self.counts['exists'] += 1
+ return False
+
+ # fields to copy over for update
+ # TODO: granular contrib metadata
+ existing.contribs = existing.contribs or re.contribs
+ existing.ext_ids.dblp = existing.ext_ids.dblp or re.ext_ids.dblp
+ existing.ext_ids.wikidata_qid = existing.ext_ids.wikidata_qid or re.ext_ids.wikidata_qid
+ existing.release_type = existing.release_type or re.release_type
+ existing.release_stage = existing.release_stage or re.release_stage
+ existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
+ existing.container_id = existing.container_id or re.container_id
+ existing.extra['dblp'] = re.extra['dblp']
+ existing.volume = existing.volume or re.volume
+ existing.issue = existing.issue or re.issue
+ existing.pages = existing.pages or re.pages
+
+ try:
+ self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+ self.counts['update'] += 1
+ except fatcat_openapi_client.rest.ApiException as err:
+ # there is a code path where we try to update the same release
+ # twice in a row; if that happens, just skip
+ # NOTE: API behavior might change in the future?
+ if "release_edit_editgroup_id_ident_id_key" in err.body:
+ self.counts['skip-update-conflict'] += 1
+ return False
+ else:
+ raise err
+
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
+ def dblp_contribs(self, authors: Any) -> List[fatcat_openapi_client.ReleaseContrib]:
+ """
+ - author (multiple; each a single string)
+ => may have HTML entities
+ => may have a number at the end, to aid with identifier creation
+ => orcid
+ - editor (same as author)
+ => orcid
+ """
+ contribs = []
+ index = 0
+ for elem in authors.find_all('author'):
+ contrib = self.dblp_contrib_single(elem)
+ contrib.role = "author"
+ contrib.index = index
+ contribs.append(contrib)
+ index += 1
+
+ for elem in authors.find_all('editor'):
+ contrib = self.dblp_contrib_single(elem)
+ contrib.role = "editor"
+ contribs.append(contrib)
+
+ return contribs
+
+ def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
+ """
+ In the future, we might try to implement creator key-ification and lookup here.
+
+ Example rows:
+
+ <author>Michael H. B&ouml;hlen</author>
+ <author orcid="0000-0002-4354-9138">Nicolas Heist</author>
+ <author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>
+ """
+
+ creator_id = None
+ extra = None
+ raw_name = clean_str(elem.text)
+
+ # remove number in author name, if present
+ if raw_name and raw_name.split()[-1].isdigit():
+ raw_name = ' '.join(raw_name.split()[:-1])
+
+ if elem.get('orcid'):
+ orcid = clean_orcid(elem['orcid'])
+ if orcid:
+ creator_id = self.lookup_orcid(orcid)
+ if not creator_id:
+ extra = dict(orcid=orcid)
+ return fatcat_openapi_client.ReleaseContrib(
+ raw_name=raw_name,
+ creator_id=creator_id,
+ extra=extra,
+ )
+
+ def dblp_ext_ids(self, xml_elem: Any, dblp_key: str) -> fatcat_openapi_client.ReleaseExtIds:
+ """
+ Takes a full XML object and returns external identifiers.
+
+ Currently these can be arXiv identifiers, DOIs, or Wikidata QIDs.
+
+ - ee (electronic edition; often DOI?)
+ => in some cases a "local" URL
+ => publisher URL; often DOI
+ => type attr
+ - url
+ => dblp internal link to table-of-contents
+ """
+
+ doi: Optional[str] = None
+ wikidata_qid: Optional[str] = None
+ arxiv_id: Optional[str] = None
+ for ee in xml_elem.find_all('ee'):
+ url = ee.text
+ # convert DOI-like domains, which mostly have DOIs anyways
+ if '://doi.acm.org/' in url:
+ url = url.replace('://doi.acm.org/', '://doi.org/')
+ elif '://doi.ieeecomputersociety.org/' in url:
+ url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+
+ if 'doi.org/10.' in url and not doi:
+ doi = clean_doi(url)
+ elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+ wikidata_qid = clean_wikidata_qid(url)
+ elif '://arxiv.org/abs/' in url and not arxiv_id:
+ arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+ arxiv_id = clean_arxiv_id(arxiv_id)
+
+ return fatcat_openapi_client.ReleaseExtIds(
+ dblp=dblp_key,
+ doi=doi,
+ wikidata_qid=wikidata_qid,
+ arxiv=arxiv_id,
+ )