fmt (black): fatcat_tools/

author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
commit: 31d1a6a713d177990609767d508209ced19ca396 (patch)
tree: a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers/arabesque.py
parent: 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download: fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
1 file changed, 60 insertions, 53 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 2b0ff7ec..ae4f9049 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -1,9 +1,9 @@
-
 import fatcat_openapi_client
 
 from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
 
-ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'
+ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
+
 
 class ArabesqueMatchImporter(EntityImporter):
     """
@@ -38,17 +38,17 @@ class ArabesqueMatchImporter(EntityImporter):
 
     def __init__(self, api, extid_type, require_grobid=True, **kwargs):
 
-        eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist"
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
-        if kwargs.get('crawl_id'):
-            eg_extra['crawl_id'] = kwargs.get('crawl_id')
-        kwargs['do_updates'] = kwargs.get("do_updates", False)
-        super().__init__(api,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
-        assert extid_type in ('doi', 'pmcid', 'pmid')
+        eg_desc = (
+            kwargs.get("editgroup_description", None)
+            or "Match web crawl files to releases based on identifier/URL seedlist"
+        )
+        eg_extra = kwargs.get("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArabesqueMatchImporter")
+        if kwargs.get("crawl_id"):
+            eg_extra["crawl_id"] = kwargs.get("crawl_id")
+        kwargs["do_updates"] = kwargs.get("do_updates", False)
+        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
+        assert extid_type in ("doi", "pmcid", "pmid")
         self.extid_type = extid_type
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
@@ -60,33 +60,35 @@ class ArabesqueMatchImporter(EntityImporter):
             print("NOT checking GROBID status column")
 
     def want(self, row):
-        if self.require_grobid and not row['postproc_status'] == "200":
+        if self.require_grobid and not row["postproc_status"] == "200":
             return False
-        if (bool(row['hit']) is True
-                and row['final_sha1']
-                and row['final_timestamp']
-                and row['final_timestamp'] != "-"
-                and len(row['final_timestamp']) == 14
-                and row['final_mimetype']
-                and bool(row['hit']) is True
-                and row['identifier']):
+        if (
+            bool(row["hit"]) is True
+            and row["final_sha1"]
+            and row["final_timestamp"]
+            and row["final_timestamp"] != "-"
+            and len(row["final_timestamp"]) == 14
+            and row["final_mimetype"]
+            and bool(row["hit"]) is True
+            and row["identifier"]
+        ):
             return True
         else:
             return False
 
     def parse_record(self, row):
 
-        extid = row['identifier'].strip()
+        extid = row["identifier"].strip()
 
         # check/cleanup DOI
-        if self.extid_type == 'doi':
+        if self.extid_type == "doi":
             extid = extid.lower()
-            extid.replace('http://doi.org/', '')
-            extid.replace('https://doi.org/', '')
-            if extid.startswith('doi:'):
+            extid.replace("http://doi.org/", "")
+            extid.replace("https://doi.org/", "")
+            if extid.startswith("doi:"):
                 extid = extid[4:]
-            if not extid.startswith('10.'):
-                self.counts['skip-extid-invalid']
+            if not extid.startswith("10."):
+                self.counts["skip-extid-invalid"]
                 return None
 
         # lookup extid
@@ -95,35 +97,35 @@ class ArabesqueMatchImporter(EntityImporter):
         except fatcat_openapi_client.rest.ApiException as err:
             if err.status == 404:
                 # bail on 404 (release not in DB)
-                self.counts['skip-extid-not-found'] += 1
+                self.counts["skip-extid-not-found"] += 1
                 return None
             elif err.status == 400:
-                self.counts['skip-extid-invalid'] += 1
+                self.counts["skip-extid-invalid"] += 1
                 return None
             else:
                 raise err
 
-        url = make_rel_url(row['final_url'], self.default_link_rel)
+        url = make_rel_url(row["final_url"], self.default_link_rel)
         if not url:
-            self.counts['skip-url'] += 1
+            self.counts["skip-url"] += 1
             return None
-        if not row['final_timestamp']:
-            self.counts['skip-missing-timestamp'] += 1
+        if not row["final_timestamp"]:
+            self.counts["skip-missing-timestamp"] += 1
             return None
         wayback = "https://web.archive.org/web/{}/{}".format(
-            row['final_timestamp'],
-            row['final_url'])
+            row["final_timestamp"], row["final_url"]
+        )
         urls = [url, ("webarchive", wayback)]
 
         urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
 
         if len(urls) > SANE_MAX_URLS:
-            self.counts['skip-too-many-url'] += 1
+            self.counts["skip-too-many-url"] += 1
             return None
 
         fe = fatcat_openapi_client.FileEntity(
-            sha1=b32_hex(row['final_sha1']),
-            mimetype=row['final_mimetype'] or self.default_mimetype,
+            sha1=b32_hex(row["final_sha1"]),
+            mimetype=row["final_mimetype"] or self.default_mimetype,
             release_ids=[re.ident],
             urls=urls,
         )
@@ -143,15 +145,15 @@ class ArabesqueMatchImporter(EntityImporter):
 
         if (fe.release_ids[0] in existing.release_ids) and existing.urls:
             # TODO: could still, in theory update with the new URL?
-            self.counts['exists'] += 1
+            self.counts["exists"] += 1
             return False
 
         if not self.do_updates:
-            self.counts['skip-update-disabled'] += 1
+            self.counts["skip-update-disabled"] += 1
             return False
 
         if existing.ident in [e.ident for e in self._edits_inflight]:
-            self.counts['skip-update-inflight'] += 1
+            self.counts["skip-update-inflight"] += 1
             return False
 
         # TODO: this code path never gets hit because of the check above
@@ -159,28 +161,33 @@ class ArabesqueMatchImporter(EntityImporter):
             existing_urls = set([u.url for u in existing.urls])
             new_urls = set([u.url for u in fe.urls])
             if existing_urls.issuperset(new_urls):
-                self.counts['skip-update-nothing-new'] += 1
+                self.counts["skip-update-nothing-new"] += 1
                 return False
 
         # merge the existing into this one and update
         existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
-        existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls]
+        existing.urls = [
+            fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls
+        ]
         if len(existing.urls) > SANE_MAX_URLS:
-            self.counts['skip-update-too-many-url'] += 1
+            self.counts["skip-update-too-many-url"] += 1
             return None
         existing.release_ids = list(set(fe.release_ids + existing.release_ids))
         if len(existing.release_ids) > SANE_MAX_RELEASES:
-            self.counts['skip-update-too-many-url'] += 1
+            self.counts["skip-update-too-many-url"] += 1
             return None
         existing.mimetype = existing.mimetype or fe.mimetype
         edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
         self._edits_inflight.append(edit)
-        self.counts['update'] += 1
+        self.counts["update"] += 1
         return False
 
     def insert_batch(self, batch):
-        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_file_auto_batch(
+            fatcat_openapi_client.FileAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description, extra=self.editgroup_extra
+                ),
+                entity_list=batch,
+            )
+        )
author	Bryan Newbold <bnewbold@robocracy.org>	2021-11-02 18:14:59 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-11-02 18:14:59 -0700
commit	31d1a6a713d177990609767d508209ced19ca396 (patch)
tree	a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers/arabesque.py
parent	9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download	fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz fatcat-31d1a6a713d177990609767d508209ced19ca396.zip