author    | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700
commit    | 31d1a6a713d177990609767d508209ced19ca396 (patch)
tree      | a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers/dblp_release.py
parent    | 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download  | fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz, fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers/dblp_release.py')
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 257
1 file changed, 132 insertions(+), 125 deletions(-)
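
The diff below is pure black output: quote normalization, call-site reflowing, and comment spacing, with no logic changes. As a minimal sketch of the transformation (assuming the black package is importable; the repository presumably pins its own line length, defaults shown here), black's Python API reproduces the style seen in the hunks:

    import black

    # Pre-commit style: single quotes, hanging continuation arguments.
    src = (
        "eg_extra = kwargs.get('editgroup_extra', dict())\n"
        "eg_extra['agent'] = eg_extra.get('agent',\n"
        "                                 'fatcat_tools.DblpReleaseImporter')\n"
    )

    # black normalizes quotes and collapses the call onto one line, matching
    # the corresponding "+" lines in the first hunks of this commit.
    print(black.format_str(src, mode=black.Mode()))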
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 6d028f2f..5baa6cd6 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -1,4 +1,3 @@
-
 """
 Importer for DBLP release-level (article/paper/etc) XML metadata.
 
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict
 
 class DblpReleaseImporter(EntityImporter):
-
-    def __init__(self,
-                 api,
-                 dblp_container_map_file=None,
-                 **kwargs):
+    def __init__(self, api, dblp_container_map_file=None, **kwargs):
 
         eg_desc = kwargs.get(
-            'editgroup_description',
-            "Automated import of dblp metadata via XML records"
+            "editgroup_description", "Automated import of dblp metadata via XML records"
         )
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent',
-                                         'fatcat_tools.DblpReleaseImporter')
+        eg_extra = kwargs.get("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter")
 
         # ensure default is to not do updates with this worker (override super() default)
-        kwargs['do_updates'] = kwargs.get("do_updates", False)
-        super().__init__(api,
-                         editgroup_description=eg_desc,
-                         editgroup_extra=eg_extra,
-                         **kwargs)
+        kwargs["do_updates"] = kwargs.get("do_updates", False)
+        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
 
         self.dump_json_mode = kwargs.get("dump_json_mode", False)
         self.this_year = datetime.datetime.now().year
@@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter):
         "phdthesis",
         "mastersthesis",
         "www",
-        #"data", # no instances in 2020-11 dump
+        # "data", # no instances in 2020-11 dump
     ]
 
     def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
         self._dblp_container_map = dict()
         if not dblp_container_map_file:
-            print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr)
+            print(
+                "Not loading a dblp prefix container map file; entities will fail to import",
+                file=sys.stderr,
+            )
             return
         print("Loading dblp prefix container map file...", file=sys.stderr)
         for line in dblp_container_map_file:
@@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter):
             container_id = container_id.strip()
             assert len(container_id) == 26
             self._dblp_container_map[prefix] = container_id
-        print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+        print(
+            "Got {} dblp container mappings.".format(len(self._dblp_container_map)),
+            file=sys.stderr,
+        )
 
     def lookup_dblp_prefix(self, prefix):
         if not prefix:
@@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter):
 
     def want(self, xml_elem):
         if xml_elem.name not in self.ELEMENT_TYPES:
-            self.counts['skip-type'] += 1
+            self.counts["skip-type"] += 1
             return False
-        if not xml_elem.get('key'):
-            self.counts['skip-no-key'] += 1
+        if not xml_elem.get("key"):
+            self.counts["skip-no-key"] += 1
             return False
-        if xml_elem['key'].startswith('homepage/'):
-            self.counts['skip-type-homepage'] += 1
+        if xml_elem["key"].startswith("homepage/"):
+            self.counts["skip-type-homepage"] += 1
             return False
         return True
 
@@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter):
         - isbn
         """
 
-        dblp_key = xml_elem.get('key')
+        dblp_key = xml_elem.get("key")
         if not dblp_key:
-            self.counts['skip-empty-key'] += 1
+            self.counts["skip-empty-key"] += 1
             return False
-        dblp_key_type = dblp_key.split('/')[0]
+        dblp_key_type = dblp_key.split("/")[0]
 
         # dblp_prefix may be used for container lookup
         dblp_prefix = None
-        if dblp_key_type in ('journals', 'conf'):
-            dblp_prefix = '/'.join(dblp_key.split('/')[:2])
-        elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
-            dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
+        if dblp_key_type in ("journals", "conf"):
+            dblp_prefix = "/".join(dblp_key.split("/")[:2])
+        elif dblp_key_type in ("series", "reference", "tr", "books"):
+            dblp_prefix = "/".join(dblp_key.split("/")[:-1])
 
-        publtype = xml_elem.get('publtype') or None
+        publtype = xml_elem.get("publtype") or None
 
         dblp_type = xml_elem.name
         if dblp_type not in self.ELEMENT_TYPES:
-            self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+            self.counts[f"skip-dblp-type:{dblp_type}"] += 1
 
-        if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
-            self.counts['skip-key-type'] += 1
+        if dblp_key_type in ("homepages", "persons", "dblpnote"):
+            self.counts["skip-key-type"] += 1
             return False
 
-        if dblp_key.startswith('journals/corr/'):
-            self.counts['skip-arxiv-corr'] += 1
+        if dblp_key.startswith("journals/corr/"):
+            self.counts["skip-arxiv-corr"] += 1
             return False
 
         title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
         if not title:
-            self.counts['skip-title'] += 1
+            self.counts["skip-title"] += 1
             return False
-        if title.endswith('.'):
+        if title.endswith("."):
            title = title[:-1]
 
         release_type = None
-        release_stage = 'published'
+        release_stage = "published"
         withdrawn_status = None
 
         # primary releae_type detection: type of XML element, then prefix of key for granularity
-        if dblp_type == 'article':
-            release_type = 'article'
-            if dblp_key_type == 'journals' and publtype != 'informal':
-                release_type = 'article-journal'
-            elif dblp_key_type == 'tr':
-                release_type = 'report'
+        if dblp_type == "article":
+            release_type = "article"
+            if dblp_key_type == "journals" and publtype != "informal":
+                release_type = "article-journal"
+            elif dblp_key_type == "tr":
+                release_type = "report"
             elif title.startswith("Review:"):
-                release_type = 'review'
-        elif dblp_type == 'inproceedings':
-            release_type = 'paper-conference'
-        elif dblp_type == 'book':
-            release_type = 'book'
-        elif dblp_type == 'incollection':
+                release_type = "review"
+        elif dblp_type == "inproceedings":
+            release_type = "paper-conference"
+        elif dblp_type == "book":
+            release_type = "book"
+        elif dblp_type == "incollection":
             # XXX: part vs. chapter?
-            release_type = 'chapter'
-        elif dblp_type == 'data':
-            release_type = 'dataset'
-        elif dblp_type in ('mastersthesis', 'phdthesis'):
-            release_type = 'thesis'
+            release_type = "chapter"
+        elif dblp_type == "data":
+            release_type = "dataset"
+        elif dblp_type in ("mastersthesis", "phdthesis"):
+            release_type = "thesis"
 
         # overrides/extensions of the above
-        if publtype == 'informal':
+        if publtype == "informal":
             # for conferences, seems to indicate peer-review status
             # for journals, seems to indicate things like book reviews; split out above
             pass
-        elif publtype == 'encyclopedia':
-            release_type = 'entry-encyclopedia'
-        elif publtype == 'edited':
+        elif publtype == "encyclopedia":
+            release_type = "entry-encyclopedia"
+        elif publtype == "edited":
             # XXX: article?
-            release_type = 'editorial'
-        elif publtype == 'data':
-            release_type = 'dataset'
-        elif publtype == 'data':
-            release_type = 'dataset'
-        elif publtype == 'software':
-            release_type = 'software'
-        elif publtype == 'widthdrawn':
-            withdrawn_status = 'widthdrawn'
-        elif publtype == 'survey':
+            release_type = "editorial"
+        elif publtype == "data":
+            release_type = "dataset"
+        elif publtype == "data":
+            release_type = "dataset"
+        elif publtype == "software":
+            release_type = "software"
+        elif publtype == "widthdrawn":
+            withdrawn_status = "widthdrawn"
+        elif publtype == "survey":
             # XXX: flag as a review/survey article?
             pass
 
-        #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
+        # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
 
         container_name = None
         booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
@@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter):
         part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
 
         # block bogus far-future years/dates
-        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+        if release_year is not None and (
+            release_year > (self.this_year + 5) or release_year < 1000
+        ):
             release_month = None
             release_year = None
 
@@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter):
         if isbn:
             ext_ids.isbn13 = isbn
         if ext_ids.doi:
-            self.counts['has-doi'] += 1
+            self.counts["has-doi"] += 1
 
         # dblp-specific extra
         dblp_extra = dict(type=dblp_type)
         note = clean_str(xml_elem.note and xml_elem.note.text)
-        if note and 'base-search.net' not in note:
-            dblp_extra['note'] = note
+        if note and "base-search.net" not in note:
+            dblp_extra["note"] = note
         if part_of_key:
-            dblp_extra['part_of_key'] = part_of_key
+            dblp_extra["part_of_key"] = part_of_key
 
         # generic extra
         extra = dict()
         if not container_id and container_name:
-            extra['container_name'] = container_name
+            extra["container_name"] = container_name
 
-        if series and (dblp_key_type == 'series' or dblp_type == 'book'):
-            extra['series-title'] = series
+        if series and (dblp_key_type == "series" or dblp_type == "book"):
+            extra["series-title"] = series
         elif series:
-            dblp_extra['series'] = series
+            dblp_extra["series"] = series
 
-        if booktitle and dblp_key_type == 'series':
-            extra['container-title'] = booktitle
-        elif booktitle and dblp_key_type == 'conf':
-            extra['event'] = booktitle
+        if booktitle and dblp_key_type == "series":
+            extra["container-title"] = booktitle
+        elif booktitle and dblp_key_type == "conf":
+            extra["event"] = booktitle
         elif booktitle:
-            dblp_extra['booktitle'] = booktitle
+            dblp_extra["booktitle"] = booktitle
 
         if release_year and release_month:
             # TODO: release_month schema migration
-            extra['release_month'] = release_month
+            extra["release_month"] = release_month
 
         if dblp_extra:
-            extra['dblp'] = dblp_extra
+            extra["dblp"] = dblp_extra
         if not extra:
             extra = None
 
@@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter):
             withdrawn_status=withdrawn_status,
             title=title,
             release_year=release_year,
-            #release_date,
+            # release_date,
             publisher=publisher,
             ext_ids=ext_ids,
             contribs=contribs,
@@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter):
 
         if self.dump_json_mode:
             re_dict = entity_to_dict(re, api_client=self.api.api_client)
-            re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
-            re_dict['_dblp_prefix'] = dblp_prefix
+            re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem)
+            re_dict["_dblp_prefix"] = dblp_prefix
             print(json.dumps(re_dict, sort_keys=True))
             return False
 
@@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter):
 
         # then try other ext_id lookups
         if not existing:
-            for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
+            for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"):
                 extid_val = getattr(re.ext_ids, extid_type)
                 if not extid_val:
                     continue
-                #print(f" lookup release type: {extid_type} val: {extid_val}")
+                # print(f" lookup release type: {extid_type} val: {extid_val}")
                 try:
                     existing = self.api.lookup_release(**{extid_type: extid_val})
                 except fatcat_openapi_client.rest.ApiException as err:
@@ -373,12 +371,14 @@ class DblpReleaseImporter(EntityImporter):
             return True
 
         if not self.do_updates or existing.ext_ids.dblp:
-            self.counts['exists'] += 1
+            self.counts["exists"] += 1
             return False
 
         # logic for whether to do update or skip
-        if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
-            self.counts['skip-update'] += 1
+        if (
+            existing.container_id and existing.release_type and existing.release_stage
+        ) or existing.ext_ids.arxiv:
+            self.counts["skip-update"] += 1
             return False
 
         # fields to copy over for update
@@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter):
         existing.release_stage = existing.release_stage or re.release_stage
         existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
         existing.container_id = existing.container_id or re.container_id
-        existing.extra['dblp'] = re.extra['dblp']
+        existing.extra["dblp"] = re.extra["dblp"]
         existing.volume = existing.volume or re.volume
         existing.issue = existing.issue or re.issue
         existing.pages = existing.pages or re.pages
 
         try:
             self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
-            self.counts['update'] += 1
+            self.counts["update"] += 1
         except fatcat_openapi_client.rest.ApiException as err:
             # there is a code path where we try to update the same release
             # twice in a row; if that happens, just skip
             # NOTE: API behavior might change in the future?
             if "release_edit_editgroup_id_ident_id_key" in err.body:
-                self.counts['skip-update-conflict'] += 1
+                self.counts["skip-update-conflict"] += 1
                 return False
             else:
                 raise err
@@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter):
         return False
 
     def insert_batch(self, batch):
-        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_release_auto_batch(
+            fatcat_openapi_client.ReleaseAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description, extra=self.editgroup_extra
+                ),
+                entity_list=batch,
+            )
+        )
 
     def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
         """
@@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter):
         """
         contribs = []
         index = 0
-        for elem in authors.find_all('author'):
+        for elem in authors.find_all("author"):
             contrib = self.dblp_contrib_single(elem)
             contrib.role = "author"
             contrib.index = index
             contribs.append(contrib)
             index += 1
-        for elem in authors.find_all('editor'):
+        for elem in authors.find_all("editor"):
             contrib = self.dblp_contrib_single(elem)
             contrib.role = "editor"
             contribs.append(contrib)
 
@@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter):
 
         # remove number in author name, if present
         if raw_name.split()[-1].isdigit():
-            raw_name = ' '.join(raw_name.split()[:-1])
+            raw_name = " ".join(raw_name.split()[:-1])
 
-        if elem.get('orcid'):
-            orcid = clean_orcid(elem['orcid'])
+        if elem.get("orcid"):
+            orcid = clean_orcid(elem["orcid"])
             if orcid:
                 creator_id = self.lookup_orcid(orcid)
                 if not creator_id:
@@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter):
         wikidata_qid: Optional[str] = None
         arxiv_id: Optional[str] = None
         hdl: Optional[str] = None
-        for ee in xml_elem.find_all('ee'):
+        for ee in xml_elem.find_all("ee"):
             url = ee.text
             # convert DOI-like domains, which mostly have DOIs anyways
-            if '://doi.acm.org/' in url:
-                url = url.replace('://doi.acm.org/', '://doi.org/')
-            elif '://doi.ieeecomputersociety.org/' in url:
-                url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+            if "://doi.acm.org/" in url:
+                url = url.replace("://doi.acm.org/", "://doi.org/")
+            elif "://doi.ieeecomputersociety.org/" in url:
+                url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/")
 
-            if 'doi.org/10.' in url and not doi:
+            if "doi.org/10." in url and not doi:
                 doi = clean_doi(url)
-            elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+            elif "wikidata.org/entity/Q" in url and not wikidata_qid:
                 wikidata_qid = clean_wikidata_qid(url)
-            elif '://arxiv.org/abs/' in url and not arxiv_id:
-                arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+            elif "://arxiv.org/abs/" in url and not arxiv_id:
+                arxiv_id = (
+                    url.replace("http://", "")
+                    .replace("https://", "")
+                    .replace("arxiv.org/abs/", "")
+                )
                 arxiv_id = clean_arxiv_id(arxiv_id)
-            elif '://hdl.handle.net' in url and not hdl:
+            elif "://hdl.handle.net" in url and not hdl:
                 hdl = clean_hdl(url)
 
         return fatcat_openapi_client.ReleaseExtIds(
@@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter):
         sandcrawler ingest requests.
         """
         EXTID_PATTERNS = [
-            '://doi.acm.org/',
-            '://doi.ieeecomputersociety.org/',
-            'doi.org/10.',
-            'wikidata.org/entity/Q',
-            '://arxiv.org/abs/',
+            "://doi.acm.org/",
+            "://doi.ieeecomputersociety.org/",
+            "doi.org/10.",
+            "wikidata.org/entity/Q",
+            "://arxiv.org/abs/",
         ]
         urls = []
-        for ee in xml_elem.find_all('ee'):
+        for ee in xml_elem.find_all("ee"):
             url = ee.text
             skip = False
             for pattern in EXTID_PATTERNS: