path: root/python/fatcat_tools/importers/dblp_release.py
author    Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
commit    31d1a6a713d177990609767d508209ced19ca396 (patch)
tree      a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers/dblp_release.py
parent    9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download  fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz
          fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers/dblp_release.py')
-rw-r--r--  python/fatcat_tools/importers/dblp_release.py | 257
1 file changed, 132 insertions, 125 deletions
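
The diff below is a pure formatting pass. For context, here is a minimal sketch of the kind of rewrite black performs, using black's Python API; the example source string is hypothetical, and the actual commit was presumably produced by running the black CLI over python/fatcat_tools/ (the exact invocation and the project's black configuration, such as line length, are not recorded in this page and are assumptions here).

```python
import black

# Hypothetical pre-formatting snippet, in the style of the code touched below.
src = (
    "eg_extra = kwargs.get('editgroup_extra', dict())\n"
    "eg_extra['agent'] = eg_extra.get('agent',\n"
    "    'fatcat_tools.DblpReleaseImporter')\n"
)

# format_str() applies the same rewrites as a CLI run: normalizing string
# quotes to double quotes and collapsing call arguments onto one line when
# they fit. Default settings are used here for illustration only.
formatted = black.format_str(src, mode=black.FileMode())
print(formatted)
```

Running this prints the double-quoted, single-line form that matches the `+` lines in the diff; no behavior changes are involved, only whitespace and quoting.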
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 6d028f2f..5baa6cd6 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -1,4 +1,3 @@
-
"""
Importer for DBLP release-level (article/paper/etc) XML metadata.
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict
class DblpReleaseImporter(EntityImporter):
-
- def __init__(self,
- api,
- dblp_container_map_file=None,
- **kwargs):
+ def __init__(self, api, dblp_container_map_file=None, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of dblp metadata via XML records"
+ "editgroup_description", "Automated import of dblp metadata via XML records"
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DblpReleaseImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter")
# ensure default is to not do updates with this worker (override super() default)
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.dump_json_mode = kwargs.get("dump_json_mode", False)
self.this_year = datetime.datetime.now().year
@@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter):
"phdthesis",
"mastersthesis",
"www",
- #"data", # no instances in 2020-11 dump
+ # "data", # no instances in 2020-11 dump
]
def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
self._dblp_container_map = dict()
if not dblp_container_map_file:
- print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr)
+ print(
+ "Not loading a dblp prefix container map file; entities will fail to import",
+ file=sys.stderr,
+ )
return
print("Loading dblp prefix container map file...", file=sys.stderr)
for line in dblp_container_map_file:
@@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter):
container_id = container_id.strip()
assert len(container_id) == 26
self._dblp_container_map[prefix] = container_id
- print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+ print(
+ "Got {} dblp container mappings.".format(len(self._dblp_container_map)),
+ file=sys.stderr,
+ )
def lookup_dblp_prefix(self, prefix):
if not prefix:
@@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter):
def want(self, xml_elem):
if xml_elem.name not in self.ELEMENT_TYPES:
- self.counts['skip-type'] += 1
+ self.counts["skip-type"] += 1
return False
- if not xml_elem.get('key'):
- self.counts['skip-no-key'] += 1
+ if not xml_elem.get("key"):
+ self.counts["skip-no-key"] += 1
return False
- if xml_elem['key'].startswith('homepage/'):
- self.counts['skip-type-homepage'] += 1
+ if xml_elem["key"].startswith("homepage/"):
+ self.counts["skip-type-homepage"] += 1
return False
return True
@@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter):
- isbn
"""
- dblp_key = xml_elem.get('key')
+ dblp_key = xml_elem.get("key")
if not dblp_key:
- self.counts['skip-empty-key'] += 1
+ self.counts["skip-empty-key"] += 1
return False
- dblp_key_type = dblp_key.split('/')[0]
+ dblp_key_type = dblp_key.split("/")[0]
# dblp_prefix may be used for container lookup
dblp_prefix = None
- if dblp_key_type in ('journals', 'conf'):
- dblp_prefix = '/'.join(dblp_key.split('/')[:2])
- elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
- dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
+ if dblp_key_type in ("journals", "conf"):
+ dblp_prefix = "/".join(dblp_key.split("/")[:2])
+ elif dblp_key_type in ("series", "reference", "tr", "books"):
+ dblp_prefix = "/".join(dblp_key.split("/")[:-1])
- publtype = xml_elem.get('publtype') or None
+ publtype = xml_elem.get("publtype") or None
dblp_type = xml_elem.name
if dblp_type not in self.ELEMENT_TYPES:
- self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+ self.counts[f"skip-dblp-type:{dblp_type}"] += 1
- if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
- self.counts['skip-key-type'] += 1
+ if dblp_key_type in ("homepages", "persons", "dblpnote"):
+ self.counts["skip-key-type"] += 1
return False
- if dblp_key.startswith('journals/corr/'):
- self.counts['skip-arxiv-corr'] += 1
+ if dblp_key.startswith("journals/corr/"):
+ self.counts["skip-arxiv-corr"] += 1
return False
title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
if not title:
- self.counts['skip-title'] += 1
+ self.counts["skip-title"] += 1
return False
- if title.endswith('.'):
+ if title.endswith("."):
title = title[:-1]
release_type = None
- release_stage = 'published'
+ release_stage = "published"
withdrawn_status = None
# primary releae_type detection: type of XML element, then prefix of key for granularity
- if dblp_type == 'article':
- release_type = 'article'
- if dblp_key_type == 'journals' and publtype != 'informal':
- release_type = 'article-journal'
- elif dblp_key_type == 'tr':
- release_type = 'report'
+ if dblp_type == "article":
+ release_type = "article"
+ if dblp_key_type == "journals" and publtype != "informal":
+ release_type = "article-journal"
+ elif dblp_key_type == "tr":
+ release_type = "report"
elif title.startswith("Review:"):
- release_type = 'review'
- elif dblp_type == 'inproceedings':
- release_type = 'paper-conference'
- elif dblp_type == 'book':
- release_type = 'book'
- elif dblp_type == 'incollection':
+ release_type = "review"
+ elif dblp_type == "inproceedings":
+ release_type = "paper-conference"
+ elif dblp_type == "book":
+ release_type = "book"
+ elif dblp_type == "incollection":
# XXX: part vs. chapter?
- release_type = 'chapter'
- elif dblp_type == 'data':
- release_type = 'dataset'
- elif dblp_type in ('mastersthesis', 'phdthesis'):
- release_type = 'thesis'
+ release_type = "chapter"
+ elif dblp_type == "data":
+ release_type = "dataset"
+ elif dblp_type in ("mastersthesis", "phdthesis"):
+ release_type = "thesis"
# overrides/extensions of the above
- if publtype == 'informal':
+ if publtype == "informal":
# for conferences, seems to indicate peer-review status
# for journals, seems to indicate things like book reviews; split out above
pass
- elif publtype == 'encyclopedia':
- release_type = 'entry-encyclopedia'
- elif publtype == 'edited':
+ elif publtype == "encyclopedia":
+ release_type = "entry-encyclopedia"
+ elif publtype == "edited":
# XXX: article?
- release_type = 'editorial'
- elif publtype == 'data':
- release_type = 'dataset'
- elif publtype == 'data':
- release_type = 'dataset'
- elif publtype == 'software':
- release_type = 'software'
- elif publtype == 'widthdrawn':
- withdrawn_status = 'widthdrawn'
- elif publtype == 'survey':
+ release_type = "editorial"
+ elif publtype == "data":
+ release_type = "dataset"
+ elif publtype == "data":
+ release_type = "dataset"
+ elif publtype == "software":
+ release_type = "software"
+ elif publtype == "widthdrawn":
+ withdrawn_status = "widthdrawn"
+ elif publtype == "survey":
# XXX: flag as a review/survey article?
pass
- #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
+ # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
container_name = None
booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
@@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter):
part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_month = None
release_year = None
@@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter):
if isbn:
ext_ids.isbn13 = isbn
if ext_ids.doi:
- self.counts['has-doi'] += 1
+ self.counts["has-doi"] += 1
# dblp-specific extra
dblp_extra = dict(type=dblp_type)
note = clean_str(xml_elem.note and xml_elem.note.text)
- if note and 'base-search.net' not in note:
- dblp_extra['note'] = note
+ if note and "base-search.net" not in note:
+ dblp_extra["note"] = note
if part_of_key:
- dblp_extra['part_of_key'] = part_of_key
+ dblp_extra["part_of_key"] = part_of_key
# generic extra
extra = dict()
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
- if series and (dblp_key_type == 'series' or dblp_type == 'book'):
- extra['series-title'] = series
+ if series and (dblp_key_type == "series" or dblp_type == "book"):
+ extra["series-title"] = series
elif series:
- dblp_extra['series'] = series
+ dblp_extra["series"] = series
- if booktitle and dblp_key_type == 'series':
- extra['container-title'] = booktitle
- elif booktitle and dblp_key_type == 'conf':
- extra['event'] = booktitle
+ if booktitle and dblp_key_type == "series":
+ extra["container-title"] = booktitle
+ elif booktitle and dblp_key_type == "conf":
+ extra["event"] = booktitle
elif booktitle:
- dblp_extra['booktitle'] = booktitle
+ dblp_extra["booktitle"] = booktitle
if release_year and release_month:
# TODO: release_month schema migration
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
if dblp_extra:
- extra['dblp'] = dblp_extra
+ extra["dblp"] = dblp_extra
if not extra:
extra = None
@@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter):
withdrawn_status=withdrawn_status,
title=title,
release_year=release_year,
- #release_date,
+ # release_date,
publisher=publisher,
ext_ids=ext_ids,
contribs=contribs,
@@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter):
if self.dump_json_mode:
re_dict = entity_to_dict(re, api_client=self.api.api_client)
- re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
- re_dict['_dblp_prefix'] = dblp_prefix
+ re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem)
+ re_dict["_dblp_prefix"] = dblp_prefix
print(json.dumps(re_dict, sort_keys=True))
return False
@@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter):
# then try other ext_id lookups
if not existing:
- for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
+ for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"):
extid_val = getattr(re.ext_ids, extid_type)
if not extid_val:
continue
- #print(f" lookup release type: {extid_type} val: {extid_val}")
+ # print(f" lookup release type: {extid_type} val: {extid_val}")
try:
existing = self.api.lookup_release(**{extid_type: extid_val})
except fatcat_openapi_client.rest.ApiException as err:
@@ -373,12 +371,14 @@ class DblpReleaseImporter(EntityImporter):
return True
if not self.do_updates or existing.ext_ids.dblp:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# logic for whether to do update or skip
- if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
- self.counts['skip-update'] += 1
+ if (
+ existing.container_id and existing.release_type and existing.release_stage
+ ) or existing.ext_ids.arxiv:
+ self.counts["skip-update"] += 1
return False
# fields to copy over for update
@@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter):
existing.release_stage = existing.release_stage or re.release_stage
existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
existing.container_id = existing.container_id or re.container_id
- existing.extra['dblp'] = re.extra['dblp']
+ existing.extra["dblp"] = re.extra["dblp"]
existing.volume = existing.volume or re.volume
existing.issue = existing.issue or re.issue
existing.pages = existing.pages or re.pages
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter):
return False
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
@@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter):
"""
contribs = []
index = 0
- for elem in authors.find_all('author'):
+ for elem in authors.find_all("author"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "author"
contrib.index = index
contribs.append(contrib)
index += 1
- for elem in authors.find_all('editor'):
+ for elem in authors.find_all("editor"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "editor"
contribs.append(contrib)
@@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter):
# remove number in author name, if present
if raw_name.split()[-1].isdigit():
- raw_name = ' '.join(raw_name.split()[:-1])
+ raw_name = " ".join(raw_name.split()[:-1])
- if elem.get('orcid'):
- orcid = clean_orcid(elem['orcid'])
+ if elem.get("orcid"):
+ orcid = clean_orcid(elem["orcid"])
if orcid:
creator_id = self.lookup_orcid(orcid)
if not creator_id:
@@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter):
wikidata_qid: Optional[str] = None
arxiv_id: Optional[str] = None
hdl: Optional[str] = None
- for ee in xml_elem.find_all('ee'):
+ for ee in xml_elem.find_all("ee"):
url = ee.text
# convert DOI-like domains, which mostly have DOIs anyways
- if '://doi.acm.org/' in url:
- url = url.replace('://doi.acm.org/', '://doi.org/')
- elif '://doi.ieeecomputersociety.org/' in url:
- url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+ if "://doi.acm.org/" in url:
+ url = url.replace("://doi.acm.org/", "://doi.org/")
+ elif "://doi.ieeecomputersociety.org/" in url:
+ url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/")
- if 'doi.org/10.' in url and not doi:
+ if "doi.org/10." in url and not doi:
doi = clean_doi(url)
- elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+ elif "wikidata.org/entity/Q" in url and not wikidata_qid:
wikidata_qid = clean_wikidata_qid(url)
- elif '://arxiv.org/abs/' in url and not arxiv_id:
- arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+ elif "://arxiv.org/abs/" in url and not arxiv_id:
+ arxiv_id = (
+ url.replace("http://", "")
+ .replace("https://", "")
+ .replace("arxiv.org/abs/", "")
+ )
arxiv_id = clean_arxiv_id(arxiv_id)
- elif '://hdl.handle.net' in url and not hdl:
+ elif "://hdl.handle.net" in url and not hdl:
hdl = clean_hdl(url)
return fatcat_openapi_client.ReleaseExtIds(
@@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter):
sandcrawler ingest requests.
"""
EXTID_PATTERNS = [
- '://doi.acm.org/',
- '://doi.ieeecomputersociety.org/',
- 'doi.org/10.',
- 'wikidata.org/entity/Q',
- '://arxiv.org/abs/',
+ "://doi.acm.org/",
+ "://doi.ieeecomputersociety.org/",
+ "doi.org/10.",
+ "wikidata.org/entity/Q",
+ "://arxiv.org/abs/",
]
urls = []
- for ee in xml_elem.find_all('ee'):
+ for ee in xml_elem.find_all("ee"):
url = ee.text
skip = False
for pattern in EXTID_PATTERNS: